from __future__ import absolute_import

import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str

############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents.

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # Split each document into annotated tokens, then walk the versions
    # oldest-to-newest, copying annotations forward for any token that
    # already existed in the previous version.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # Combine adjacent tokens that share an annotation, then serialize
    # with the markup function applied to each span.
    cur_tokens = compress_tokens(cur_tokens)
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
            not tok.pre_tags and
            result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result
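
# Illustrative example (approximate; the exact repr may vary slightly):
# two adjacent tokens that share an annotation, with no markup between
# them, are merged into a single token.
#
#     >>> t1 = token('Hello', trailing_whitespace=' ')
#     >>> t2 = token('World')
#     >>> t1.annotation = t2.annotation = 'version 1'
#     >>> compress_tokens([t1, t2])
#     [token('Hello World', [], [], '')]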

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += last.trailing_whitespace
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += token.trailing_whitespace
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)
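
# Illustrative example (approximate; the exact whitespace in the output
# may differ):
#
#     >>> htmldiff('<p>Here is some text.</p>',
#     ...          '<p>Here is some new text.</p>')
#     '<p>Here is some <ins>new</ins> text.</p>'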

def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # difflib does the actual work of matching up runs of tokens; we
    # then stitch a document back together, preferring markup from the
    # new document and marking deleted text with DEL_START/DEL_END
    # sentinels so it can be placed and balanced in a cleanup pass.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)

    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead we put in special
    # markers, and when the complete diffed document has been created
    # we try to move the deletes around and resolve any problems.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + token.trailing_whitespace
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # We don't want to put <ins> around unbalanced tags (there is
    # presumably accompanying markup earlier or later in the document),
    # so only the balanced portion is wrapped.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't end
        # with a space.
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # We move the space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)
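
# Illustrative example (approximate): only the balanced chunks are
# wrapped in <ins>; the dangling </b> is passed through after the
# closing tag.
#
#     >>> doc = ['Old text ']
#     >>> merge_insert(['new', '</b>'], doc)
#     >>> doc
#     ['Old text ', '<ins>', 'new', '</ins> ', '</b>']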

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with marker to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location to where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END))."""
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-before-DEL_START, stuff-inside, and
        # stuff-after-DEL_END:
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing left to do
            break
        # The stuff-inside-DEL_START/END may not be well balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up the case where the word before us didn't have a
            # trailing space:
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move the space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks
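
# Illustrative example (approximate): the DEL_START/DEL_END sentinels
# become a <del> span, with the trailing space moved outside of the
# closing tag.
#
#     >>> cleanup_delete(['Kept ', DEL_START, 'dropped ', DEL_END, 'tail'])
#     ['Kept ', '<del>', 'dropped', '</del> ', 'tail']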

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end
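
# Illustrative example (approximate): <b> is opened but never closed,
# and </i> is closed but never opened, so both are pulled out of the
# balanced middle portion.
#
#     >>> split_unbalanced(['<b>', 'hey', '</i>'])
#     (['<b>'], ['hey'], ['</i>'])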

def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]
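
# Illustrative example (approximate):
#
#     >>> split_delete(['a', DEL_START, 'b', DEL_END, 'c'])
#     (['a'], ['b'], ['c'])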

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word; we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag; we can't move forward past it
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ Like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag; we can't move the delete text earlier
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
                                          self.post_tags, self.trailing_whitespace)

    def html(self):
        return _unicode(self)
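
# Illustrative example (approximate): a token compares equal to its
# plain text, while the surrounding markup and trailing whitespace
# ride along as attributes.
#
#     >>> tok = token('Hello', pre_tags=['<p>'], trailing_whitespace=' ')
#     >>> tok == 'Hello', tok.pre_tags, tok.html()
#     (True, ['<p>'], 'Hello')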

class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and return token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Then re-join the chunks into token objects:
    return fixup_chunks(chunks)
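
# Illustrative example (approximate; the exact repr may vary): tags are
# attached to the words they surround rather than being tokens of
# their own.
#
#     >>> tokenize('<p>Hello <b>world</b></p>')
#     [token('Hello', ['<p>'], [], ' '), token('world', ['<b>'], ['</b>', '</p>'], '')]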

def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html
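
# Illustrative example (approximate): everything outside <body> is
# discarded, and any existing <ins>/<del> markup is stripped.
#
#     >>> cleanup_html('<html><body><p>Hi <ins>there</ins></p></body></html>')
#     '<p>Hi there</p>'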

end_whitespace_re = re.compile(r'[ \t\n\r]$')

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]

def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result
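
# Illustrative example (approximate): start tags accumulate onto the
# following word, and a trailing end tag attaches to the previous word.
#
#     >>> fixup_chunks(['<p>', 'Hi ', 'there', '</p>'])
#     [token('Hi', ['<p>'], [], ' '), token('there', [], ['</p>'], '')]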

empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )

def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)

split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)

def split_words(text):
    """ Splits some text into words. Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []

    words = split_words_re.findall(text)
    return words
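
# Illustrative example (approximate): each word keeps the whitespace
# that follows it.
#
#     >>> split_words('Hello,  world!\n')
#     ['Hello,  ', 'world!\n']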

start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)
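
# Illustrative example (approximate), building a small element with
# lxml's fragment_fromstring:
#
#     >>> el = fragment_fromstring('<b class="x">word</b>')
#     >>> start_tag(el), end_tag(el)
#     ('<b class="x">', '</b>')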

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html
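
# Illustrative example (approximate):
#
#     >>> el = fragment_fromstring('<p>Hi <b>there</b></p>')
#     >>> serialize_html_fragment(el, skip_outer=True)
#     'Hi <b>there</b>'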

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]

if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()