1  """ 
  2  lxml-based doctest output comparison. 
  3   
  4  Note: normally, you should just import the `lxml.usedoctest` and 
  5  `lxml.html.usedoctest` modules from within a doctest, instead of this 
  6  one:: 
  7   
  8      >>> import lxml.usedoctest # for XML output 
  9   
 10      >>> import lxml.html.usedoctest # for HTML output 
 11   
 12  To use this module directly, you must call ``lxmldoctest.install()``, 
 13  which will cause doctest to use this in all subsequent calls. 
 14   
 15  This changes the way output is checked and comparisons are made for 
 16  XML or HTML-like content. 
 17   
 18  XML or HTML content is noticed because the example starts with ``<`` 
 19  (it's HTML if it starts with ``<html``).  You can also use the 
 20  ``PARSE_HTML`` and ``PARSE_XML`` flags to force parsing. 
 21   
 22  Some rough wildcard-like things are allowed.  Whitespace is generally 
 23  ignored (except in attributes).  In text (attributes and text in the 
 24  body) you can use ``...`` as a wildcard.  In an example it also 
 25  matches any trailing tags in the element, though it does not match 
 26  leading tags.  You may create a tag ``<any>`` or include an ``any`` 
 27  attribute in the tag.  An ``any`` tag matches any tag, while the 
 28  attribute matches any and all attributes. 
 29   
When a match fails, the reformatted example and the actual output are
displayed (indented), and a rough diff-like output is given.  Anything
marked with ``+`` is in the output but wasn't supposed to be, and
similarly ``-`` means it's in the example but wasn't in the output.
 34   
 35  You can disable parsing on one line with ``# doctest:+NOPARSE_MARKUP`` 
 36  """ 
 37   
 38  from lxml import etree 
 39  import sys 
 40  import re 
 41  import doctest 
 42  try: 
 43      from html import escape as html_escape 
 44  except ImportError: 
 45      from cgi import escape as html_escape 
 46   
# Names exported by ``from <this module> import *``.
__all__ = ['PARSE_HTML', 'PARSE_XML', 'NOPARSE_MARKUP', 'LXMLOutputChecker',
           'LHTMLOutputChecker', 'install', 'temp_install']

# Use the ``basestring`` builtin when it exists; when the lookup raises
# NameError, fall back to the ``(str, bytes)`` tuple for isinstance() checks.
try:
    _basestring = basestring
except NameError:
    _basestring = (str, bytes)

# True when the running interpreter's major version is 3 or higher.
_IS_PYTHON_3 = sys.version_info[0] >= 3

# Register three extra doctest option flags under these names so that they
# can be used in ``# doctest: +NAME`` directives; the returned flag values
# are kept as module constants.
PARSE_HTML = doctest.register_optionflag('PARSE_HTML')
PARSE_XML = doctest.register_optionflag('PARSE_XML')
NOPARSE_MARKUP = doctest.register_optionflag('NOPARSE_MARKUP')

# Reference to doctest's checker class as it is at import time.  Code in this
# file later reassigns ``doctest.OutputChecker``, so this name keeps the
# class that was in place before that happens.
OutputChecker = doctest.OutputChecker
 62   
 64      if v is None: 
 65          return None 
 66      else: 
 67          return v.strip() 
  68   
 71   
# Module-level HTML parser instance, created once at import time.
# NOTE(review): per the lxml docs, ``recover=False`` disables the parser's
# error recovery and ``remove_blank_text=True`` drops ignorable
# whitespace-only text nodes -- confirm against the installed lxml version.
_html_parser = etree.HTMLParser(recover=False, remove_blank_text=True)
 73   
 76   
 77   
# Matches text that starts like a default object repr such as
# ``<Foo object at 0x...>``: a "<", one or more non-">" characters, a space,
# then "at " or "object ".
_repr_re = re.compile(r'^<[^>]+ (at|object) ')
# Matches a run of two or more consecutive spaces, tabs or newlines
# (a single whitespace character on its own does not match).
_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+')
 80   
 82   
    # Tag names consulted through ``self.empty_tags`` when deciding whether
    # an element should be treated as an empty (content-less) HTML tag.
    empty_tags = (
        'param', 'img', 'area', 'br', 'basefont', 'input',
        'base', 'meta', 'link', 'col')
 86   
 89   
 91          alt_self = getattr(self, '_temp_override_self', None) 
 92          if alt_self is not None: 
 93              super_method = self._temp_call_super_check_output 
 94              self = alt_self 
 95          else: 
 96              super_method = OutputChecker.check_output 
 97          parser = self.get_parser(want, got, optionflags) 
 98          if not parser: 
 99              return super_method( 
100                  self, want, got, optionflags) 
101          try: 
102              want_doc = parser(want) 
103          except etree.XMLSyntaxError: 
104              return False 
105          try: 
106              got_doc = parser(got) 
107          except etree.XMLSyntaxError: 
108              return False 
109          return self.compare_docs(want_doc, got_doc) 
 110   
126   
128          s = s.strip() 
129          return (s.startswith('<') 
130                  and not _repr_re.search(s)) 
 131   
133          if not self.tag_compare(want.tag, got.tag): 
134              return False 
135          if not self.text_compare(want.text, got.text, True): 
136              return False 
137          if not self.text_compare(want.tail, got.tail, True): 
138              return False 
139          if 'any' not in want.attrib: 
140              want_keys = sorted(want.attrib.keys()) 
141              got_keys = sorted(got.attrib.keys()) 
142              if want_keys != got_keys: 
143                  return False 
144              for key in want_keys: 
145                  if not self.text_compare(want.attrib[key], got.attrib[key], False): 
146                      return False 
147          if want.text != '...' or len(want): 
148              want_children = list(want) 
149              got_children = list(got) 
150              while want_children or got_children: 
151                  if not want_children or not got_children: 
152                      return False 
153                  want_first = want_children.pop(0) 
154                  got_first = got_children.pop(0) 
155                  if not self.compare_docs(want_first, got_first): 
156                      return False 
157                  if not got_children and want_first.tail == '...': 
158                      break 
159          return True 
 160   
161 -    def text_compare(self, want, got, strip): 
 162          want = want or '' 
163          got = got or '' 
164          if strip: 
165              want = norm_whitespace(want).strip() 
166              got = norm_whitespace(got).strip() 
167          want = '^%s$' % re.escape(want) 
168          want = want.replace(r'\.\.\.', '.*') 
169          if re.search(want, got): 
170              return True 
171          else: 
172              return False 
 173   
175          if want == 'any': 
176              return True 
177          if (not isinstance(want, _basestring) 
178              or not isinstance(got, _basestring)): 
179              return want == got 
180          want = want or '' 
181          got = got or '' 
182          if want.startswith('{...}'): 
183               
184              return want.split('}')[-1] == got.split('}')[-1] 
185          else: 
186              return want == got 
 187   
189          want = example.want 
190          parser = self.get_parser(want, got, optionflags) 
191          errors = [] 
192          if parser is not None: 
193              try: 
194                  want_doc = parser(want) 
195              except etree.XMLSyntaxError: 
196                  e = sys.exc_info()[1] 
197                  errors.append('In example: %s' % e) 
198              try: 
199                  got_doc = parser(got) 
200              except etree.XMLSyntaxError: 
201                  e = sys.exc_info()[1] 
202                  errors.append('In actual output: %s' % e) 
203          if parser is None or errors: 
204              value = OutputChecker.output_difference( 
205                  self, example, got, optionflags) 
206              if errors: 
207                  errors.append(value) 
208                  return '\n'.join(errors) 
209              else: 
210                  return value 
211          html = parser is html_fromstring 
212          diff_parts = ['Expected:', 
213                        self.format_doc(want_doc, html, 2), 
214                        'Got:', 
215                        self.format_doc(got_doc, html, 2), 
216                        'Diff:', 
217                        self.collect_diff(want_doc, got_doc, html, 2)] 
218          return '\n'.join(diff_parts) 
 219   
221          if not html: 
222              return False 
223          if el.tag not in self.empty_tags: 
224              return False 
225          if el.text or len(el): 
226               
227              return False 
228          return True 
 229   
264   
271   
282       
288   
290          parts = [] 
291          if not len(want) and not len(got): 
292              parts.append(' '*indent) 
293              parts.append(self.collect_diff_tag(want, got)) 
294              if not self.html_empty_tag(got, html): 
295                  parts.append(self.collect_diff_text(want.text, got.text)) 
296                  parts.append(self.collect_diff_end_tag(want, got)) 
297              parts.append(self.collect_diff_text(want.tail, got.tail)) 
298              parts.append('\n') 
299              return ''.join(parts) 
300          parts.append(' '*indent) 
301          parts.append(self.collect_diff_tag(want, got)) 
302          parts.append('\n') 
303          if strip(want.text) or strip(got.text): 
304              parts.append(' '*indent) 
305              parts.append(self.collect_diff_text(want.text, got.text)) 
306              parts.append('\n') 
307          want_children = list(want) 
308          got_children = list(got) 
309          while want_children or got_children: 
310              if not want_children: 
311                  parts.append(self.format_doc(got_children.pop(0), html, indent+2, '+')) 
312                  continue 
313              if not got_children: 
314                  parts.append(self.format_doc(want_children.pop(0), html, indent+2, '-')) 
315                  continue 
316              parts.append(self.collect_diff( 
317                  want_children.pop(0), got_children.pop(0), html, indent+2)) 
318          parts.append(' '*indent) 
319          parts.append(self.collect_diff_end_tag(want, got)) 
320          parts.append('\n') 
321          if strip(want.tail) or strip(got.tail): 
322              parts.append(' '*indent) 
323              parts.append(self.collect_diff_text(want.tail, got.tail)) 
324              parts.append('\n') 
325          return ''.join(parts) 
 326   
328          if not self.tag_compare(want.tag, got.tag): 
329              tag = '%s (got: %s)' % (want.tag, got.tag) 
330          else: 
331              tag = got.tag 
332          attrs = [] 
333          any = want.tag == 'any' or 'any' in want.attrib 
334          for name, value in sorted(got.attrib.items()): 
335              if name not in want.attrib and not any: 
336                  attrs.append('+%s="%s"' % (name, self.format_text(value, False))) 
337              else: 
338                  if name in want.attrib: 
339                      text = self.collect_diff_text(want.attrib[name], value, False) 
340                  else: 
341                      text = self.format_text(value, False) 
342                  attrs.append('%s="%s"' % (name, text)) 
343          if not any: 
344              for name, value in sorted(want.attrib.items()): 
345                  if name in got.attrib: 
346                      continue 
347                  attrs.append('-%s="%s"' % (name, self.format_text(value, False))) 
348          if attrs: 
349              tag = '<%s %s>' % (tag, ' '.join(attrs)) 
350          else: 
351              tag = '<%s>' % tag 
352          return tag 
 353   
355          if want.tag != got.tag: 
356              tag = '%s (got: %s)' % (want.tag, got.tag) 
357          else: 
358              tag = got.tag 
359          return '</%s>' % tag 
 360   
361 -    def collect_diff_text(self, want, got, strip=True): 
 362          if self.text_compare(want, got, strip): 
363              if not got: 
364                  return '' 
365              return self.format_text(got, strip) 
366          text = '%s (got: %s)' % (want, got) 
367          return self.format_text(text, strip) 
  368   
372       
374      """ 
375      Install doctestcompare for all future doctests. 
376   
377      If html is true, then by default the HTML parser will be used; 
378      otherwise the XML parser is used. 
379      """ 
380      if html: 
381          doctest.OutputChecker = LHTMLOutputChecker 
382      else: 
383          doctest.OutputChecker = LXMLOutputChecker 
 384   
386      """ 
387      Use this *inside* a doctest to enable this checker for this 
388      doctest only. 
389   
390      If html is true, then by default the HTML parser will be used; 
391      otherwise the XML parser is used. 
392      """ 
393      if html: 
394          Checker = LHTMLOutputChecker 
395      else: 
396          Checker = LXMLOutputChecker 
397      frame = _find_doctest_frame() 
398      dt_self = frame.f_locals['self'] 
399      checker = Checker() 
400      old_checker = dt_self._checker 
401      dt_self._checker = checker 
402       
403       
404       
405       
406       
407       
408       
409       
410       
411      if _IS_PYTHON_3: 
412          check_func = frame.f_locals['check'].__func__ 
413          checker_check_func = checker.check_output.__func__ 
414      else: 
415          check_func = frame.f_locals['check'].im_func 
416          checker_check_func = checker.check_output.im_func 
417       
418       
419      doctest.etree = etree 
420      _RestoreChecker(dt_self, old_checker, checker, 
421                      check_func, checker_check_func, 
422                      del_module) 
 423   
    def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func,
                 del_module):
        """
        Record the objects involved in a temporary checker swap and
        activate it.

        ``old_checker`` is stored as ``self.checker`` and receives two
        extra attributes: ``_temp_call_super_check_output`` (set to
        ``self.call_super``) and ``_temp_override_self`` (set to
        ``new_checker``).  The remaining arguments are stored unchanged on
        the instance, after which ``install_clone()`` and
        ``install_dt_self()`` are called.
        """
        self.dt_self = dt_self
        self.checker = old_checker
        # Tag the *old* checker object with the hooks; code elsewhere in this
        # file looks them up with getattr(self, '_temp_override_self', None).
        self.checker._temp_call_super_check_output = self.call_super
        self.checker._temp_override_self = new_checker
        self.check_func = check_func
        self.clone_func = clone_func
        self.del_module = del_module
        # Run the install steps only after every attribute above is set.
        # NOTE(review): presumably both helpers read those attributes --
        # confirm against their definitions.
        self.install_clone()
        self.install_dt_self()
 437          if _IS_PYTHON_3: 
438              self.func_code = self.check_func.__code__ 
439              self.func_globals = self.check_func.__globals__ 
440              self.check_func.__code__ = self.clone_func.__code__ 
441          else: 
442              self.func_code = self.check_func.func_code 
443              self.func_globals = self.check_func.func_globals 
444              self.check_func.func_code = self.clone_func.func_code 
 446          if _IS_PYTHON_3: 
447              self.check_func.__code__ = self.func_code 
448          else: 
449              self.check_func.func_code = self.func_code 
 451          self.prev_func = self.dt_self._DocTestRunner__record_outcome 
452          self.dt_self._DocTestRunner__record_outcome = self 
 454          self.dt_self._DocTestRunner__record_outcome = self.prev_func 
 456          if self.del_module: 
457              import sys 
458              del sys.modules[self.del_module] 
459              if '.' in self.del_module: 
460                  package, module = self.del_module.rsplit('.', 1) 
461                  package_mod = sys.modules[package] 
462                  delattr(package_mod, module) 
 479      import sys 
480      frame = sys._getframe(1) 
481      while frame: 
482          l = frame.f_locals 
483          if 'BOOM' in l: 
484               
485              return frame 
486          frame = frame.f_back 
487      raise LookupError( 
488          "Could not find doctest (only use this function *inside* a doctest)") 
 489       
# Self-test collected by ``doctest.testmod()`` below.  The examples call
# ``print(...)`` with a single string argument, which is valid and prints
# the same text on both Python 2 and Python 3 (a bare ``print "..."``
# statement is a SyntaxError on Python 3).
__test__ = {
    'basic': '''
    >>> temp_install()
    >>> print("""<xml a="1" b="2">stuff</xml>""")
    <xml b="2" a="1">...</xml>
    >>> print("""<xml xmlns="http://example.com"><tag   attr="bar"   /></xml>""")
    <xml xmlns="...">
      <tag attr="..." />
    </xml>
    >>> print("""<xml>blahblahblah<foo /></xml>""") # doctest: +NOPARSE_MARKUP, +ELLIPSIS
    <xml>...foo /></xml>
    '''}

if __name__ == '__main__':
    import doctest
    doctest.testmod()
506