Main Page   Class Hierarchy   Alphabetical List   Compound List   Examples  
itparser.h
1 #ifndef _MIMETIC_PARSER_ITPARSER_H_
2 #define _MIMETIC_PARSER_ITPARSER_H_
3 #include <iterator>
4 #include <algorithm>
5 #include <stack>
6 #include <iostream>
7 #include <mimetic/tree.h>
8 #include <mimetic/utils.h>
9 #include <mimetic/mimeentity.h>
10 
11 
12 // FIXME: handle HigherLevelClosingBoundary
13 
14 namespace mimetic
15 {
16 
17 /// Parse the input reading from an iterator
18 template<typename Iterator,
19 typename ItCategory=typename std::iterator_traits<Iterator>::iterator_category>
21 {
22 };
23 
24 /*
25  * Input Iterator
26  */
27 template<typename Iterator>
28 struct IteratorParser<Iterator, std::input_iterator_tag>
29 {
30 
32  : m_me(me), m_iMask(imNone), m_lastBoundary(NoBoundary)
33  {
34  m_entityStack.push(&m_me);
35  }
36  virtual ~IteratorParser()
37  {
38  }
39  /**
40  * set the Ignore Mask to \p mask
41  */
42  void iMask(size_t mask) { m_iMask = mask; }
43  /**
44  * get the Ignore Mask
45  */
46  size_t iMask() const { return m_iMask; }
47  /**
48  * start parsing
49  */
50  void run(Iterator bit, Iterator eit)
51  {
52  m_bit = bit;
53  m_eit = eit;
54  doLoad();
55  }
56 protected:
57  typedef std::list<std::string> BoundaryList;
58  enum {
59  CR = 0xD,
60  LF = 0xA,
61  NL = '\n'
62  };
63  enum /* ParsingElem */ {
64  peIgnore,
65  pePreamble,
66  peBody,
67  peEpilogue
68  };
69  enum BoundaryType {
70  NoBoundary = 0,
71  Boundary,
72  ClosingBoundary,
73  HigherLevelBoundary
74  //, HigherLevelClosingBoundary
75  };
76  enum EntityType {
77  etRfc822,
78  etMsgRfc822,
79  etMultipart
80  };
81  // vars
82  MimeEntity& m_me;
83  Iterator m_bit, m_eit;
84  size_t m_iMask; // ignore mask
85  BoundaryList m_boundaryList;
86  BoundaryType m_lastBoundary;
87  std::stack<MimeEntity*> m_entityStack;
88 
89 protected:
90  void appendPreambleBlock(const char* buf, int sz)
91  {
92  MimeEntity* pMe = m_entityStack.top();
93  pMe->body().preamble().append(buf,sz);
94  }
95 
96  void appendEpilogueBlock(const char* buf, int sz)
97  {
98  MimeEntity* pMe = m_entityStack.top();
99  pMe->body().epilogue().append(buf,sz);
100  }
101 
102  void appendBodyBlock(const char* buf, int sz)
103  {
104  MimeEntity* pMe = m_entityStack.top();
105  pMe->body().append(buf, sz);
106  }
107 
108  std::string getBoundary()
109  {
110  const MimeEntity* pMe = m_entityStack.top();
111  const ContentType& ct = pMe->header().contentType();
112  return std::string("--") + ct.param("boundary");
113  }
114 
115  void popChild()
116  {
117  m_entityStack.pop();
118  }
119 
120  void pushNewChild()
121  {
122  MimeEntity* pMe = m_entityStack.top();
123  MimeEntity* pChild = new MimeEntity;
124  pMe->body().parts().push_back(pChild);
125  m_entityStack.push(pChild);
126  }
127 
128  EntityType getType()
129  {
130  MimeEntity* pMe = m_entityStack.top();
131  const Header& h = pMe->header();
132  // will NOT be automatically created if it doesn't exists;
133  // null ContentType will be returned
134  const ContentType& ct = h.contentType();
135  if(ct.isMultipart())
136  return etMultipart;
137  else if (ct.type() == "message" && ct.subtype() == "rfc822")
138  return etMsgRfc822;
139  else
140  return etRfc822;
141  }
142 
143  void addField(const std::string& name, const std::string& value)
144  {
145  MimeEntity* pMe = m_entityStack.top();
146  Header& h = pMe->header();
147  Header::iterator it = h.insert(h.end(), Field());
148  it->name(name);
149  it->value(value);
150  }
151 
152  BoundaryType isBoundary(const std::string& line)
153  {
154  if(line.length() == 0 || line[0] != '-')
155  return m_lastBoundary = NoBoundary;
156 
157  int level = 0; // multipart nesting level
158  int lineLen = line.length();
159  BoundaryList::const_iterator bit,eit;
160  bit = m_boundaryList.begin(), eit = m_boundaryList.end();
161  for(;bit != eit; ++bit, ++level)
162  {
163  const std::string& b = *bit;
164  int bLen = b.length();
165  if(line.compare(0, bLen, b) == 0)
166  {
167  // not the expected boundary, malformed msg
168  if(level > 0)
169  return m_lastBoundary=HigherLevelBoundary;
170  // plain boundary or closing boundary?
171  if(lineLen > bLen && line.compare(bLen,2,"--") == 0)
172  return m_lastBoundary = ClosingBoundary;
173  else
174  return m_lastBoundary = Boundary;
175  }
176  }
177  return m_lastBoundary = NoBoundary;
178  }
179  // is new line
180  inline bool isnl(char c) const
181  {
182  return (c == CR || c == LF);
183  }
184  // is a two char newline
185  inline bool isnl(char a, char b) const
186  {
187  if(a == CR || a == LF)
188  if(b == (a == CR ? LF : CR))
189  return true;
190  return false;
191  }
192  void doLoad()
193  {
194  loadHeader();
195  loadBody();
196  }
197  bool valid() const
198  {
199  return m_bit != m_eit;
200  }
// Store c at buf[pos] and advance pos, growing the buffer when it is full.
//
//  buf    heap buffer owned by the caller (0 when nothing is allocated yet);
//         replaced by a larger new[] buffer when needed, the old one is
//         released with delete[]
//  bufsz  usable size of buf; one extra byte is always allocated so the
//         caller can write a terminator at buf[pos]
//  c      character to store
//  pos    index to write at, incremented on return
//
// The buffer grows in alloc_block steps. A new buffer is zero filled and
// buf/bufsz are updated only after the allocation succeeded, so they stay
// consistent if new[] throws.
void append(char*& buf, size_t& bufsz, char c, size_t& pos)
{
    enum { alloc_block = 128 };
    if(pos >= bufsz)
    {
        size_t grown = bufsz;
        while(pos >= grown)
            grown += alloc_block;

        // value-initialized: every byte (terminator slot included) is 0
        char* fresh = new char[grown + 1]();
        if(buf != 0)
        {
            std::copy(buf, buf + bufsz, fresh);
            delete[] buf;
        }
        buf = fresh;
        bufsz = grown;
    }
    buf[pos++] = c;
}
222  // parses the header and calls addField and pushChild
223  // to add fields and nested entities
224  void loadHeader()
225  {
226  enum {
227  sInit,
228  sIgnoreLine,
229  sNewline,
230  sWaitingName,
231  sWaitingValue,
232  sWaitingFoldedValue,
233  sName,
234  sValue,
235  sIgnoreHeader
236  };
237  register int status;
238  int pos;
239  char *name, *value;
240  size_t nBufSz, vBufSz, nPos, vPos;
241  char prev, c = 0;
242 
243  name = value = 0;
244  pos = nBufSz = vBufSz = nPos = vPos = 0;
245  status = (m_iMask & imHeader ? sIgnoreHeader : sInit);
246  //status = sInit;
247  while(m_bit != m_eit)
248  {
249  c = *m_bit;
250  switch(status)
251  {
252  case sInit:
253  if(isnl(c))
254  status = sNewline;
255  else
256  status = sName;
257  continue;
258  case sIgnoreLine:
259  if(!isnl(c))
260  break;
261  status = sNewline;
262  continue;
263  case sNewline:
264  status = sWaitingName;
265  if(pos > 0)
266  {
267  pos = 0;
268  prev = c;
269  if(++m_bit == m_eit) goto out; //eof
270  c = *m_bit;
271  if(c == (prev == CR ? LF : CR))
272  {
273  --pos;
274  break;
275  } else
276  continue;
277  } else {
278  // empty line, end of header
279  prev = c;
280  if(++m_bit == m_eit) goto out; //eof
281  c = *m_bit;
282  if(c == (prev == CR ? LF : CR))
283  ++m_bit;
284  goto out;
285  }
286  case sWaitingName:
287  if(isblank(c))
288  {
289  // folded value
290  status = sWaitingFoldedValue;
291  continue;
292  }
293  // not blank, new field or empty line
294  if(nPos)
295  {
296  name[nPos] = 0;
297  // is not an empty field (name: \n)
298  if(vPos)
299  {
300  value[vPos] = 0;
301  addField(name,value);
302  } else
303  addField(name,"");
304  nPos = vPos = 0;
305  }
306  status = (isnl(c) ? sNewline : sName);
307  continue;
308  case sWaitingValue:
309  if(isblank(c))
310  break; // eat leading blanks
311  status = sValue;
312  continue;
313  case sWaitingFoldedValue:
314  if(isblank(c))
315  break; // eat leading blanks
316  append(value, vBufSz, ' ', vPos);
317  status = sValue;
318  continue;
319  case sName:
320  if(c > 32 && c < 127 && c != ':') {
321  if(nPos > 0 && isblank(name[nPos-1]))
322  {
323  /* "FIELDNAME BLANK+ c" found, consider that the first
324  body line */
325  onBlock(name, nPos, peBody);
326  goto out;
327  }
328  append(name, nBufSz, c, nPos);
329  } else if(c == ':') {
330  if(nPos == 0)
331  {
332  /* header line starting with ':', ignore the line */
333  status = sIgnoreLine;
334  continue;
335  }
336 
337  /* malformed fix: remove any trailing blanks of the field
338  name */
339  while(nPos > 0 && isblank(name[nPos-1]))
340  nPos--;
341 
342  status = sWaitingValue;
343  } else if(isblank(c)) {
344  /* blank after the field name -> malformed; it may be a
345  malformed field with trailing blank or
346  the start of the body; save the char so we can try to
347  recover later trimming the field name or push the
348  whole line to the body part with onBlock() */
349  append(name, nBufSz, c, nPos);
350  } else {
351  /* bad header line or blank line between header and body is
352  missing; consider we're in the first line of the body */
353  onBlock(name, nPos, peBody);
354  goto out;
355  }
356  break;
357  case sValue:
358  if(isnl(c))
359  {
360  status = sNewline;
361  continue;
362  }
363  append(value, vBufSz, c, vPos);
364  break;
365  case sIgnoreHeader:
366  if(isnl(c))
367  {
368  prev = c;
369  if(++m_bit == m_eit) goto out; //eof
370  c = *m_bit;
371  if(c == (prev == CR ? LF : CR))
372  ++m_bit;
373  if(pos == 0)
374  goto out; //empty line, eoh
375  pos = 0;
376  continue;
377  }
378  break;
379  }
380  ++m_bit; ++pos;
381  }
382  out:
383  if(name)
384  delete[] name;
385  if(value)
386  delete[] value;
387  return;
388  }
389  void loadBody()
390  {
391  switch(getType())
392  {
393  case etRfc822:
394  if(m_iMask & imBody)
395  jump_to_next_boundary();
396  else
397  copy_until_boundary(peBody);
398  break;
399  case etMultipart:
400  loadMultipart();
401  break;
402  case etMsgRfc822:
403  if(m_iMask & imChildParts)
404  jump_to_next_boundary();
405  else {
406  pushNewChild();
407  doLoad(); // load child entities
408  popChild();
409  }
410  break;
411  }
412  }
413  void loadMultipart()
414  {
415  std::string boundary = getBoundary();
416  m_boundaryList.push_front(boundary);
417  ParsingElem pe;
418  // preamble
419  pe = (m_iMask & imPreamble ? peIgnore : pePreamble );
420  copy_until_boundary(pe);
421  while(m_bit != m_eit)
422  {
423  switch(m_lastBoundary)
424  {
425  case NoBoundary:
426  return; // eof
427  case Boundary:
428  if(m_iMask & imChildParts)
429  jump_to_next_boundary();
430  else {
431  pushNewChild();
432  doLoad();
433  popChild();
434  }
435  break;
436  case ClosingBoundary:
437  m_boundaryList.erase(m_boundaryList.begin());
438  // epilogue
439  pe=(m_iMask & imEpilogue? peIgnore: peEpilogue);
440  copy_until_boundary(pe);
441  return;
442  case HigherLevelBoundary:
443  m_boundaryList.erase(m_boundaryList.begin());
444  return;
445  }
446  }
447  }
448  inline void onBlock(const char* block, int sz, ParsingElem pe)
449  {
450  switch(pe)
451  {
452  case peIgnore:
453  return;
454  case pePreamble:
455  appendPreambleBlock(block, sz);
456  break;
457  case peEpilogue:
458  appendEpilogueBlock(block, sz);
459  break;
460  case peBody:
461  appendBodyBlock(block, sz);
462  break;
463  }
464  }
465  void jump_to_next_boundary()
466  {
467  copy_until_boundary(peIgnore);
468  }
469  // this is where most of execution time is spent when parsing
470  // large messages; I'm using a plain char[] buffer instead of
471  // std::string because I want to be as fast as possible here
472  virtual void copy_until_boundary(ParsingElem pe)
473  {
474  size_t pos, lines, eomsz = 0;
475  register char c;
476  enum { nlsz = 1 };
477  const char *eom = 0;
478 
479  enum { blksz = 4096 };
480  char block[blksz];
481  size_t blkpos = 0;
482  size_t sl_off = 0; // start of line offset into *block
483 
484  pos = lines = 0;
485  while(m_bit != m_eit)
486  {
487  // if buffer is full
488  if(blkpos >= blksz - 2 - nlsz)
489  {
490  if(sl_off == 0)
491  {
492  // very long line found, assume it
493  // can't be a boundary and flush the buf
494  // with the partial line
495  block[blkpos] = 0;
496  onBlock(block, blkpos, pe);
497  blkpos = sl_off = 0;
498  } else {
499  // flush the buffer except the last
500  // (probably incomplete) line
501  size_t llen = blkpos - sl_off;
502  onBlock(block, sl_off, pe);
503  memmove(block, block + sl_off, llen);
504  sl_off = 0;
505  blkpos = llen;
506  }
507  }
508  c = *m_bit;
509  if(isnl(c))
510  {
511  char nlbuf[3] = { 0, 0, 0 };
512 
513  nlbuf[0] = c; // save the current NL char in nlbuf
514 
515  // save the second char of the NL sequence (if any) in nlbuf
516  if(++m_bit != m_eit)
517  {
518  char next = *m_bit;
519  if(next == (c == CR ? LF : CR))
520  {
521  nlbuf[1] = next; // save the next char in the NL seq
522  ++m_bit;
523  }
524  }
525 
526  if(pos)
527  {
528  // not an empty row, is this a boundary?
529  block[blkpos] = 0;
530  if(block[sl_off] == '-' && sl_off < blkpos &&
531  block[sl_off+1] == '-')
532  {
533  std::string Line(block+sl_off, blkpos-sl_off);
534  if(isBoundary(Line))
535  {
536  // trim last newline
537  if (sl_off>=2)
538  {
539  int i = sl_off;
540  char a = block[--i];
541  char b = block[--i];
542 
543  if(isnl(a,b))
544  sl_off -= 2;
545  else if(isnl(a))
546  sl_off--;
547 
548  } else if (sl_off==1 && isnl(block[0])) {
549  sl_off--;
550  }
551  onBlock(block, sl_off, pe);
552  return;
553  }
554  }
555  // exit if this is the end of message
556  // marker
557  if(eom && pos >= eomsz)
558  {
559  char *line = block + sl_off;
560  size_t i = 0;
561  for(; i < eomsz; i++)
562  if(eom[i] != line[i])
563  break;
564  if(i==eomsz) // if eom found
565  {
566  onBlock(block, sl_off,
567  pe);
568  return;
569  }
570  }
571  }
572  // append the saved NL sequence
573  for(int i = 0; nlbuf[i] != 0; i++)
574  block[blkpos++] = nlbuf[i];
575  block[blkpos] = 0;
576  sl_off = blkpos;
577  pos = 0;
578  } else {
579  pos++; // line pos
580  block[blkpos++] = c;
581  ++m_bit;
582  }
583  }
584  // eof
585  block[blkpos] = 0;
586  onBlock(block, blkpos, pe);
587  }
588 };
589 
590 
591 /*
592  * Forward Iterator
593  */
594 template<typename Iterator>
595 struct IteratorParser<Iterator, std::forward_iterator_tag>:
596  public IteratorParser<Iterator, std::input_iterator_tag>
597 {
598  /* input_iterator ops
599  * *it = xxx
600  * X& op++
601  * X& op++(int)
602  */
603  typedef IteratorParser<Iterator, std::input_iterator_tag> base_type;
604  IteratorParser(MimeEntity& me)
605  : base_type(me)
606  {
607  }
608 };
609 
610 /*
611  * Bidirectional Iterator
612  */
613 template<typename Iterator>
614 struct IteratorParser<Iterator, std::bidirectional_iterator_tag>:
615  public IteratorParser<Iterator, std::forward_iterator_tag>
616 {
617  typedef IteratorParser<Iterator, std::forward_iterator_tag> base_type;
618  IteratorParser(MimeEntity& me)
619  : base_type(me)
620  {
621  }
622 };
623 
624 /*
625  * Random Access Iterator
626  */
627 template<typename Iterator>
628 struct IteratorParser<Iterator, std::random_access_iterator_tag>:
629  public IteratorParser<Iterator, std::bidirectional_iterator_tag>
630 {
631  typedef IteratorParser<Iterator, std::bidirectional_iterator_tag> base_type;
632  IteratorParser(MimeEntity& me)
633  : base_type(me)
634  {
635  }
636 private:
637  using base_type::peIgnore;
638  using base_type::pePreamble;
639  using base_type::peBody;
640  using base_type::peEpilogue;
641 
642  using base_type::NoBoundary;
643  using base_type::Boundary;
644  using base_type::ClosingBoundary;
645  using base_type::HigherLevelBoundary;
646 
647  using base_type::m_boundaryList;
648  using base_type::m_lastBoundary;
649  using base_type::m_entityStack;
650  using base_type::m_me;
651  using base_type::m_iMask;
652  using base_type::m_bit;
653  using base_type::m_eit;
654  using base_type::isnl;
655 
656  typedef TreeNode<char> BoundaryTree;
657  inline void onBlock(Iterator bit, int size, ParsingElem pe)
658  {
659  if(pe == peIgnore)
660  return;
661  Iterator eit = bit + size;
662  MimeEntity* pMe = m_entityStack.top();
663  switch(pe)
664  {
665  case pePreamble:
666  pMe->body().preamble().append(bit, eit);
667  break;
668  case peEpilogue:
669  pMe->body().epilogue().append(bit, eit);
670  break;
671  case peBody:
672  pMe->body().append(bit, eit);
673  break;
674  }
675  }
676  void copy_until_boundary(ParsingElem pe)
677  {
678  // if we don't have any boundary copy until m_eit and return
679  if(m_boundaryList.empty())
680  {
681  onBlock(m_bit, m_eit-m_bit, pe);
682  m_bit = m_eit;
683  return;
684  }
685  // search for current boundary; if not found (i.e. malformed
686  // message) repeat the search for higher level boundary
687  // (slow just for malformed msg, very fast otherwise)
688  typename base_type::BoundaryList::const_iterator
689  bBit = m_boundaryList.begin(), bEit = m_boundaryList.end();
690  m_lastBoundary = NoBoundary;
691  int depth = 0;
692  for( ;bBit != bEit; ++bBit, ++depth)
693  {
694  const std::string& boundary = *bBit;
695  Iterator off;
696  if( (off=utils::find_bm(m_bit,m_eit,boundary)) != m_eit)
697  {
698  Iterator base = m_bit;
699  size_t block_sz = off - base;
700  m_lastBoundary =
701  (depth ? HigherLevelBoundary: Boundary);
702  off += boundary.length();
703  m_bit = off;
704  if(off<m_eit-1 && *off =='-' && *(off+1) == '-')
705  {
706  m_lastBoundary = ClosingBoundary;
707  m_bit = off + 2;
708  }
709  if(m_bit < m_eit-1 && isnl(*m_bit))
710  {
711  char c = *m_bit++;
712  char next = *m_bit;
713  if(isnl(next) && next != c)
714  ++m_bit;
715  }
716 
717  // trim last newline
718  if(block_sz)
719  {
720  Iterator p = base + block_sz;
721  char a = *--p, b = *--p;
722  if(isnl(a,b))
723  block_sz -= 2;
724  else if(isnl(a))
725  block_sz--;
726  }
727  onBlock(base, block_sz, pe);
728  return;
729  } else {
730  onBlock(m_bit, m_eit-m_bit, pe);
731  m_bit = m_eit;
732  }
733  }
734  }
735  BoundaryTree m_boundaryTree;
736  void buildBoundaryTree()
737  {
738  m_boundaryTree = BoundaryTree(); // clear
739  typename base_type::BoundaryList::const_iterator
740  bit = m_boundaryList.begin(), eit = m_boundaryList.end();
741  BoundaryTree::NodeList *pChilds;
742  BoundaryTree::NodeList::iterator it;
743  int depth = 0;
744  for( ; bit != eit; ++bit)
745  {
746  pChilds = &m_boundaryTree.childList();
747  it = pChilds->begin();
748  const char *w = bit->c_str();
749  do
750  {
751  it = find_if(pChilds->begin(), pChilds->end(),
752  FindNodePred<char>(*w));
753  if( it == pChilds->end() )
754  it = pChilds->insert(pChilds->end(),*w);
755  pChilds = &it->childList();
756  depth++;
757  } while(*(++w));
758  }
759  }
760 
761 };
762 
763 }
764 
765 #endif
Represent a MIME entity.
Definition: mimeentity.h:37
Parse the input reading from an iterator.
Definition: itparser.h:20