1   
   2   
   3   
   4   
   5   
   6   
   7   
   8   
   9   
  10   
  11   
  12   
  13   
  14   
  15   
  16   
  17   
  18   
  19   
  20   
  21   
  22   
  23   
  24   
  25   
  26   
  27   
  28   
  29   
  30   
  31  """The ``lxml.html`` tool set for HTML handling. 
  32  """ 
  33   
  34  from __future__ import absolute_import 
  35   
  36  __all__ = [ 
  37      'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 
  38      'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 
  39      'find_rel_links', 'find_class', 'make_links_absolute', 
  40      'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse'] 
  41   
  42   
  43  import copy 
  44  import sys 
  45  import re 
  46  from functools import partial 
  47   
  48  try: 
  49      from collections.abc import MutableMapping, MutableSet 
  50  except ImportError: 
  51      from collections import MutableMapping, MutableSet 
  52   
  53  from .. import etree 
  54  from . import defs 
  55  from ._setmixin import SetMixin 
  56   
  57  try: 
  58      from urlparse import urljoin 
  59  except ImportError: 
  60       
  61      from urllib.parse import urljoin 
  62   
  63  try: 
  64      unicode 
  65  except NameError: 
  66       
  67      unicode = str 
  68  try: 
  69      basestring 
  70  except NameError: 
  71       
  72      basestring = (str, bytes) 
  76      if not s: 
  77          return s 
  78      if sys.version_info[0] >= 3: 
  79          sub = re.compile(r"^(\s*)u'", re.M).sub 
  80      else: 
  81          sub = re.compile(r"^(\s*)b'", re.M).sub 
  82      return sub(r"\1'", s) 
   83   
  84   
# Namespace URI of XHTML.  Several of the lookups below match a tag both
# without a namespace and inside this namespace (bound to the ``x`` prefix).
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# <a> elements that carry a ``rel`` attribute, at or below the context node.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
# <option> elements at or below the context node.
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
# <form> elements at or below the context node.
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})

# Elements whose whitespace-separated ``class`` attribute contains the token
# passed as ``$class_name``.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements whose ``id`` attribute equals ``$id``.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath string value of the context node.
_collect_string_content = etree.XPath("string()")
# Iterates over ``url(...)`` occurrences (case-insensitive).  Group 1 is the
# content of the parentheses and may still be wrapped in single or double
# quotes.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# Iterates over ``@import "..."`` occurrences; only the double-quoted form is
# matched, and group 1 is the text between the quotes.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
# <label> elements anywhere in the document whose ``for`` attribute equals
# ``$id``.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Runs of non-space characters.
_archive_re = re.compile(r'[^ ]+')
# Searches a string of the form ``<something>; [url=]<target>``
# (case-insensitive); the named group ``url`` is everything after the first
# ';' and the optional ``url=`` prefix.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
 107      if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": 
 108          return s[1:-1], pos+1 
 109      else: 
 110          return s,pos 
  111   
 122   
 129   
 132      """Provides access to an element's class attribute as a set-like collection. 
 133      Usage:: 
 134   
 135          >>> el = fromstring('<p class="hidden large">Text</p>') 
 136          >>> classes = el.classes  # or: classes = Classes(el.attrib) 
 137          >>> classes |= ['block', 'paragraph'] 
 138          >>> el.get('class') 
 139          'hidden large block paragraph' 
 140          >>> classes.toggle('hidden') 
 141          False 
 142          >>> el.get('class') 
 143          'large block paragraph' 
 144          >>> classes -= ('some', 'classes', 'block') 
 145          >>> el.get('class') 
 146          'large paragraph' 
 147      """ 
 149          self._attributes = attributes 
 150          self._get_class_value = partial(attributes.get, 'class', '') 
  151   
 152 -    def add(self, value): 
  153          """ 
 154          Add a class. 
 155   
 156          This has no effect if the class is already present. 
 157          """ 
 158          if not value or re.search(r'\s', value): 
 159              raise ValueError("Invalid class name: %r" % value) 
 160          classes = self._get_class_value().split() 
 161          if value in classes: 
 162              return 
 163          classes.append(value) 
 164          self._attributes['class'] = ' '.join(classes) 
  165   
 167          """ 
 168          Remove a class if it is currently present. 
 169   
 170          If the class is not present, do nothing. 
 171          """ 
 172          if not value or re.search(r'\s', value): 
 173              raise ValueError("Invalid class name: %r" % value) 
 174          classes = [name for name in self._get_class_value().split() 
 175                     if name != value] 
 176          if classes: 
 177              self._attributes['class'] = ' '.join(classes) 
 178          elif 'class' in self._attributes: 
 179              del self._attributes['class'] 
  180   
 182          """ 
 183          Remove a class; it must currently be present. 
 184   
 185          If the class is not present, raise a KeyError. 
 186          """ 
 187          if not value or re.search(r'\s', value): 
 188              raise ValueError("Invalid class name: %r" % value) 
 189          super(Classes, self).remove(value) 
  190   
 194   
 196          return iter(self._get_class_value().split()) 
  197   
 199          return len(self._get_class_value().split()) 
  200   
 201       
 202   
 204          """ 
 205          Add all names from 'values'. 
 206          """ 
 207          classes = self._get_class_value().split() 
 208          extended = False 
 209          for value in values: 
 210              if value not in classes: 
 211                  classes.append(value) 
 212                  extended = True 
 213          if extended: 
 214              self._attributes['class'] = ' '.join(classes) 
  215   
 217          """ 
 218          Add a class name if it isn't there yet, or remove it if it exists. 
 219   
 220          Returns true if the class was added (and is now enabled) and 
 221          false if it was removed (and is now disabled). 
 222          """ 
 223          if not value or re.search(r'\s', value): 
 224              raise ValueError("Invalid class name: %r" % value) 
 225          classes = self._get_class_value().split() 
 226          try: 
 227              classes.remove(value) 
 228              enabled = False 
 229          except ValueError: 
 230              classes.append(value) 
 231              enabled = True 
 232          if classes: 
 233              self._attributes['class'] = ' '.join(classes) 
 234          else: 
 235              del self._attributes['class'] 
 236          return enabled 
   237   
 240   
    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets the element attribute ``key`` to ``value`` by delegating to the
        parent class implementation.

        If no value is provided, or if the value is None, a 'boolean'
        attribute without value is created, e.g. ``form.set('novalidate')``
        gives "<form novalidate></form>".
        """
        super(HtmlElement, self).set(key, value)
  249   
 250      @property 
 252          """ 
 253          A set-like wrapper around the 'class' attribute. 
 254          """ 
 255          return Classes(self.attrib) 
  256   
 257      @classes.setter 
 265   
 266      @property 
 268          """ 
 269          Returns the base URL, given when the page was parsed. 
 270   
 271          Use with ``urlparse.urljoin(el.base_url, href)`` to get 
 272          absolute URLs. 
 273          """ 
 274          return self.getroottree().docinfo.URL 
  275   
 276      @property 
 282   
 283      @property 
 285          """ 
 286          Return the <body> element.  Can be called from a child element
 287          to get the document's body.
 288          """ 
 289          return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0] 
  290   
 291      @property 
 293          """ 
 294          Returns the <head> element.  Can be called from a child 
 295          element to get the document's head. 
 296          """ 
 297          return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0] 
  298   
 299      @property 
 301          """ 
 302          Get or set any <label> element associated with this element. 
 303          """ 
 304          id = self.get('id') 
 305          if not id: 
 306              return None 
 307          result = _label_xpath(self, id=id) 
 308          if not result: 
 309              return None 
 310          else: 
 311              return result[0] 
  312   
 313      @label.setter 
 315          id = self.get('id') 
 316          if not id: 
 317              raise TypeError( 
 318                  "You cannot set a label for an element (%r) that has no id" 
 319                  % self) 
 320          if _nons(label.tag) != 'label': 
 321              raise TypeError( 
 322                  "You can only assign label to a label element (not %r)" 
 323                  % label) 
 324          label.set('for', id) 
  325   
 326      @label.deleter 
 331   
 333          """ 
 334          Removes this element from the tree, including its children and 
 335          text.  The tail text is joined to the previous element or 
 336          parent. 
 337          """ 
 338          parent = self.getparent() 
 339          assert parent is not None 
 340          if self.tail: 
 341              previous = self.getprevious() 
 342              if previous is None: 
 343                  parent.text = (parent.text or '') + self.tail 
 344              else: 
 345                  previous.tail = (previous.tail or '') + self.tail 
 346          parent.remove(self) 
  347   
 349          """ 
 350          Remove the tag, but not its children or text.  The children and text 
 351          are merged into the parent. 
 352   
 353          Example:: 
 354   
 355              >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>') 
 356              >>> h.find('.//b').drop_tag() 
 357              >>> print(tostring(h, encoding='unicode')) 
 358              <div>Hello World!</div> 
 359          """ 
 360          parent = self.getparent() 
 361          assert parent is not None 
 362          previous = self.getprevious() 
 363          if self.text and isinstance(self.tag, basestring): 
 364               
 365              if previous is None: 
 366                  parent.text = (parent.text or '') + self.text 
 367              else: 
 368                  previous.tail = (previous.tail or '') + self.text 
 369          if self.tail: 
 370              if len(self): 
 371                  last = self[-1] 
 372                  last.tail = (last.tail or '') + self.tail 
 373              elif previous is None: 
 374                  parent.text = (parent.text or '') + self.tail 
 375              else: 
 376                  previous.tail = (previous.tail or '') + self.tail 
 377          index = parent.index(self) 
 378          parent[index:index+1] = self[:] 
  379   
 381          """ 
 382          Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements. 
 383          """ 
 384          rel = rel.lower() 
 385          return [el for el in _rel_links_xpath(self) 
 386                  if el.get('rel').lower() == rel] 
  387   
 389          """ 
 390          Find any elements with the given class name. 
 391          """ 
 392          return _class_xpath(self, class_name=class_name) 
  393   
 395          """ 
 396          Get the first element in a document with the given id.  If none is 
 397          found, return the default argument if provided or raise KeyError 
 398          otherwise. 
 399   
 400          Note that there can be more than one element with the same id, 
 401          and this isn't uncommon in HTML documents found in the wild. 
 402          Browsers return only the first match, and this function does 
 403          the same. 
 404          """ 
 405          try: 
 406               
 407               
 408              return _id_xpath(self, id=id)[0] 
 409          except IndexError: 
 410              if default: 
 411                  return default[0] 
 412              else: 
 413                  raise KeyError(id) 
  414   
    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).

        The value is the result of evaluating the XPath ``string()``
        function with this element as the context node.
        """
        return _collect_string_content(self)
  420   
    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to
        ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Imported inside the method, so lxml.cssselect is only loaded when
        # this method is actually called.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)
  433   
 434       
 435       
 436       
 437   
 438 -    def make_links_absolute(self, base_url=None, resolve_base_href=True, 
 439                              handle_failures=None): 
  440          """ 
 441          Make all links in the document absolute, given the 
 442          ``base_url`` for the document (the full URL where the document 
 443          came from), or if no ``base_url`` is given, then the ``.base_url`` 
 444          of the document. 
 445   
 446          If ``resolve_base_href`` is true, then any ``<base href>`` 
 447          tags in the document are used *and* removed from the document. 
 448          If it is false then any such tag is ignored. 
 449   
 450          If ``handle_failures`` is None (default), a failure to process 
 451          a URL will abort the processing.  If set to 'ignore', errors 
 452          are ignored.  If set to 'discard', failing URLs will be removed. 
 453          """ 
 454          if base_url is None: 
 455              base_url = self.base_url 
 456              if base_url is None: 
 457                  raise TypeError( 
 458                      "No base_url given, and the document has no base_url") 
 459          if resolve_base_href: 
 460              self.resolve_base_href() 
 461   
 462          if handle_failures == 'ignore': 
 463              def link_repl(href): 
 464                  try: 
 465                      return urljoin(base_url, href) 
 466                  except ValueError: 
 467                      return href 
  468          elif handle_failures == 'discard': 
 469              def link_repl(href): 
 470                  try: 
 471                      return urljoin(base_url, href) 
 472                  except ValueError: 
 473                      return None 
  474          elif handle_failures is None: 
 475              def link_repl(href): 
 476                  return urljoin(base_url, href) 
 477          else: 
 478              raise ValueError( 
 479                  "unexpected value for handle_failures: %r" % handle_failures) 
 480   
 481          self.rewrite_links(link_repl) 
 482   
 484          """ 
 485          Find any ``<base href>`` tag in the document, and apply its 
 486          values to all links found in the document.  Also remove the 
 487          tag once it has been applied. 
 488   
 489          If ``handle_failures`` is None (default), a failure to process 
 490          a URL will abort the processing.  If set to 'ignore', errors 
 491          are ignored.  If set to 'discard', failing URLs will be removed. 
 492          """ 
 493          base_href = None 
 494          basetags = self.xpath('//base[@href]|//x:base[@href]', 
 495                                namespaces={'x': XHTML_NAMESPACE}) 
 496          for b in basetags: 
 497              base_href = b.get('href') 
 498              b.drop_tree() 
 499          if not base_href: 
 500              return 
 501          self.make_links_absolute(base_href, resolve_base_href=False, 
 502                                   handle_failures=handle_failures) 
  503   
 505          """ 
 506          Yield (element, attribute, link, pos), where attribute may be None 
 507          (indicating the link is in the text).  ``pos`` is the position 
 508          where the link occurs; often 0, but sometimes something else in 
 509          the case of links in stylesheets or style tags. 
 510   
 511          Note: <base href> is *not* taken into account in any way.  The 
 512          link you get is exactly the link in the document. 
 513   
 514          Note: multiple links inside of a single text string or 
 515          attribute value are returned in reversed order.  This makes it 
 516          possible to replace or delete them from the text string value 
 517          based on their reported text positions.  Otherwise, a 
 518          modification at one text position can change the positions of 
 519          links reported later on. 
 520          """ 
 521          link_attrs = defs.link_attrs 
 522          for el in self.iter(etree.Element): 
 523              attribs = el.attrib 
 524              tag = _nons(el.tag) 
 525              if tag == 'object': 
 526                  codebase = None 
 527                   
 528                   
 529                  if 'codebase' in attribs: 
 530                      codebase = el.get('codebase') 
 531                      yield (el, 'codebase', codebase, 0) 
 532                  for attrib in ('classid', 'data'): 
 533                      if attrib in attribs: 
 534                          value = el.get(attrib) 
 535                          if codebase is not None: 
 536                              value = urljoin(codebase, value) 
 537                          yield (el, attrib, value, 0) 
 538                  if 'archive' in attribs: 
 539                      for match in _archive_re.finditer(el.get('archive')): 
 540                          value = match.group(0) 
 541                          if codebase is not None: 
 542                              value = urljoin(codebase, value) 
 543                          yield (el, 'archive', value, match.start()) 
 544              else: 
 545                  for attrib in link_attrs: 
 546                      if attrib in attribs: 
 547                          yield (el, attrib, attribs[attrib], 0) 
 548              if tag == 'meta': 
 549                  http_equiv = attribs.get('http-equiv', '').lower() 
 550                  if http_equiv == 'refresh': 
 551                      content = attribs.get('content', '') 
 552                      match = _parse_meta_refresh_url(content) 
 553                      url = (match.group('url') if match else content).strip() 
 554                       
 555                       
 556                      if url: 
 557                          url, pos = _unquote_match( 
 558                              url, match.start('url') if match else content.find(url)) 
 559                          yield (el, 'content', url, pos) 
 560              elif tag == 'param': 
 561                  valuetype = el.get('valuetype') or '' 
 562                  if valuetype.lower() == 'ref': 
 563                       
 564                       
 565                       
 566                       
 567                       
 568                       
 569                      yield (el, 'value', el.get('value'), 0) 
 570              elif tag == 'style' and el.text: 
 571                  urls = [ 
 572                       
 573                      _unquote_match(match.group(1), match.start(1))[::-1] 
 574                      for match in _iter_css_urls(el.text) 
 575                      ] + [ 
 576                      (match.start(1), match.group(1)) 
 577                      for match in _iter_css_imports(el.text) 
 578                      ] 
 579                  if urls: 
 580                       
 581                       
 582                       
 583                      urls.sort(reverse=True) 
 584                      for start, url in urls: 
 585                          yield (el, None, url, start) 
 586              if 'style' in attribs: 
 587                  urls = list(_iter_css_urls(attribs['style'])) 
 588                  if urls: 
 589                       
 590                      for match in urls[::-1]: 
 591                          url, start = _unquote_match(match.group(1), match.start(1)) 
 592                          yield (el, 'style', url, start) 
  593   
 594 -    def rewrite_links(self, link_repl_func, resolve_base_href=True, 
 595                        base_href=None): 
  596          """ 
 597          Rewrite all the links in the document.  For each link 
 598          ``link_repl_func(link)`` will be called, and the return value 
 599          will replace the old link. 
 600   
 601          Note that links may not be absolute (unless you first called 
 602          ``make_links_absolute()``), and may be internal (e.g., 
 603          ``'#anchor'``).  They can also be values like 
 604          ``'mailto:email'`` or ``'javascript:expr'``. 
 605   
 606          If you give ``base_href`` then all links passed to 
 607          ``link_repl_func()`` will take that into account. 
 608   
 609          If the ``link_repl_func`` returns None, the attribute or 
 610          tag text will be removed completely. 
 611          """ 
 612          if base_href is not None: 
 613               
 614               
 615              self.make_links_absolute( 
 616                  base_href, resolve_base_href=resolve_base_href) 
 617          elif resolve_base_href: 
 618              self.resolve_base_href() 
 619   
 620          for el, attrib, link, pos in self.iterlinks(): 
 621              new_link = link_repl_func(link.strip()) 
 622              if new_link == link: 
 623                  continue 
 624              if new_link is None: 
 625                   
 626                  if attrib is None: 
 627                      el.text = '' 
 628                  else: 
 629                      del el.attrib[attrib] 
 630                  continue 
 631   
 632              if attrib is None: 
 633                  new = el.text[:pos] + new_link + el.text[pos+len(link):] 
 634                  el.text = new 
 635              else: 
 636                  cur = el.get(attrib) 
 637                  if not pos and len(cur) == len(link): 
 638                      new = new_link   
 639                  else: 
 640                      new = cur[:pos] + new_link + cur[pos+len(link):] 
 641                  el.set(attrib, new) 
  642   
 645      """ 
 646      An object that represents a method on an element as a function; 
 647      the function takes either an element or an HTML string.  It 
 648      returns whatever the function normally returns, or if the function 
 649      works in-place (and so returns None) it returns a serialized form 
 650      of the resulting document. 
 651      """ 
 657          result_type = type(doc) 
 658          if isinstance(doc, basestring): 
 659              if 'copy' in kw: 
 660                  raise TypeError( 
 661                      "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name) 
 662              doc = fromstring(doc, **kw) 
 663          else: 
 664              if 'copy' in kw: 
 665                  make_a_copy = kw.pop('copy') 
 666              else: 
 667                  make_a_copy = self.copy 
 668              if make_a_copy: 
 669                  doc = copy.deepcopy(doc) 
 670          meth = getattr(doc, self.name) 
 671          result = meth(*args, **kw) 
 672           
 673          if result is None: 
 674               
 675              return _transform_result(result_type, doc) 
 676          else: 
 677              return result 
   678   
 679   
# Module-level function forms of the element methods with the same names.
# NOTE(review): ``copy`` looks like the default for whether an element passed
# in is deep-copied before the method runs (the call logic reads
# ``self.copy`` when no ``copy=`` keyword is given) -- confirm against
# ``_MethodFunc.__init__``.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
 690   
 696   
 700   
 701   
 702 -class HtmlEntity(etree.EntityBase, HtmlMixin): 
  704   
 707      """A lookup scheme for HTML Element classes. 
 708   
 709      To create a lookup instance with different Element classes, pass a tag 
 710      name mapping of Element classes in the ``classes`` keyword argument and/or 
 711      a tag name mapping of Mixin classes in the ``mixins`` keyword argument. 
 712      The special key '*' denotes a Mixin class that should be mixed into all 
 713      Element classes. 
 714      """ 
 715      _default_element_classes = {} 
 716   
 717 -    def __init__(self, classes=None, mixins=None): 
  734   
 735 -    def lookup(self, node_type, document, namespace, name): 
   746   
 747   
 748   
 749   
 750   
 751   
# Match functions that tell whether a string starts (after optional
# whitespace) with ``<html`` or ``<!doctype``, compared case-insensitively.
# There is one variant for text input and one for bytes input.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
 770   
 774      """Parses several HTML elements, returning a list of elements. 
 775   
 776      The first item in the list may be a string. 
 777      If no_leading_text is true, then it will be an error if there is 
 778      leading text, and it will always be a list of only elements. 
 779   
 780      base_url will set the document's base_url attribute 
 781      (and the tree's docinfo.URL). 
 782      """ 
 783      if parser is None: 
 784          parser = html_parser 
 785       
 786      if isinstance(html, bytes): 
 787          if not _looks_like_full_html_bytes(html): 
 788               
 789              html = ('<html><body>'.encode('ascii') + html + 
 790                      '</body></html>'.encode('ascii')) 
 791      else: 
 792          if not _looks_like_full_html_unicode(html): 
 793              html = '<html><body>%s</body></html>' % html 
 794      doc = document_fromstring(html, parser=parser, base_url=base_url, **kw) 
 795      assert _nons(doc.tag) == 'html' 
 796      bodies = [e for e in doc if _nons(e.tag) == 'body'] 
 797      assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html)) 
 798      body = bodies[0] 
 799      elements = [] 
 800      if no_leading_text and body.text and body.text.strip(): 
 801          raise etree.ParserError( 
 802              "There is leading text: %r" % body.text) 
 803      if body.text and body.text.strip(): 
 804          elements.append(body.text) 
 805      elements.extend(body) 
 806       
 807       
 808      return elements 
  809   
 813      """ 
 814      Parses a single HTML element; it is an error if there is more than 
 815      one element, or if anything but whitespace precedes or follows the 
 816      element. 
 817   
 818      If ``create_parent`` is true (or is a tag name) then a parent node 
 819      will be created to encapsulate the HTML in a single element.  In this 
 820      case, leading or trailing text is also allowed, as are multiple elements 
 821      as result of the parsing. 
 822   
 823      Passing a ``base_url`` will set the document's ``base_url`` attribute 
 824      (and the tree's docinfo.URL). 
 825      """ 
 826      if parser is None: 
 827          parser = html_parser 
 828   
 829      accept_leading_text = bool(create_parent) 
 830   
 831      elements = fragments_fromstring( 
 832          html, parser=parser, no_leading_text=not accept_leading_text, 
 833          base_url=base_url, **kw) 
 834   
 835      if create_parent: 
 836          if not isinstance(create_parent, basestring): 
 837              create_parent = 'div' 
 838          new_root = Element(create_parent) 
 839          if elements: 
 840              if isinstance(elements[0], basestring): 
 841                  new_root.text = elements[0] 
 842                  del elements[0] 
 843              new_root.extend(elements) 
 844          return new_root 
 845   
 846      if not elements: 
 847          raise etree.ParserError('No elements found') 
 848      if len(elements) > 1: 
 849          raise etree.ParserError( 
 850              "Multiple elements found (%s)" 
 851              % ', '.join([_element_name(e) for e in elements])) 
 852      el = elements[0] 
 853      if el.tail and el.tail.strip(): 
 854          raise etree.ParserError( 
 855              "Element followed by text: %r" % el.tail) 
 856      el.tail = None 
 857      return el 
  858   
 859   
 860 -def fromstring(html, base_url=None, parser=None, **kw): 
  926   
 927   
 928 -def parse(filename_or_url, parser=None, base_url=None, **kw): 
  929      """ 
 930      Parse a filename, URL, or file-like object into an HTML document 
 931      tree.  Note: this returns a tree, not an element.  Use 
 932      ``parse(...).getroot()`` to get the document root. 
 933   
 934      You can override the base URL with the ``base_url`` keyword.  This 
 935      is most useful when parsing from a file-like object. 
 936      """ 
 937      if parser is None: 
 938          parser = html_parser 
 939      return etree.parse(filename_or_url, parser, base_url=base_url, **kw) 
  940   
 949   
 952      if isinstance(el, etree.CommentBase): 
 953          return 'comment' 
 954      elif isinstance(el, basestring): 
 955          return 'string' 
 956      else: 
 957          return _nons(el.tag) 
  958   
1078   
1079   
1080  HtmlElementClassLookup._default_element_classes['form'] = FormElement 
1119   
1122      if not url: 
1123          raise ValueError("cannot submit, no URL provided") 
1124       
1125      try: 
1126          from urllib import urlencode, urlopen 
1127      except ImportError:  
1128          from urllib.request import urlopen 
1129          from urllib.parse import urlencode 
1130      if method == 'GET': 
1131          if '?' in url: 
1132              url += '&' 
1133          else: 
1134              url += '?' 
1135          url += urlencode(values) 
1136          data = None 
1137      else: 
1138          data = urlencode(values) 
1139          if not isinstance(data, bytes): 
1140              data = data.encode('ASCII') 
1141      return urlopen(url, data) 
 1142   
1145   
1153          raise KeyError( 
1154              "You cannot remove keys from ElementDict") 
 1158          return item in self.inputs 
 1163   
1165          return '<%s for form %s>' % ( 
1166              self.__class__.__name__, 
1167              self.inputs.form._name()) 
  1168   
1235   
1266   
1267   
1268 -class TextareaElement(InputMixin, HtmlElement): 
 1269      """ 
1270      ``<textarea>`` element.  You can get the name with ``.name`` and 
1271      get/set the value with ``.value`` 
1272      """ 
1273      @property 
1275          """ 
1276          Get/set the value (which is the contents of this element) 
1277          """ 
1278          content = self.text or '' 
1279          if self.tag.startswith("{%s}" % XHTML_NAMESPACE): 
1280              serialisation_method = 'xml' 
1281          else: 
1282              serialisation_method = 'html' 
1283          for el in self: 
1284               
1285              content += etree.tostring( 
1286                  el, method=serialisation_method, encoding='unicode') 
1287          return content 
 1288   
1289      @value.setter 
    def value(self, value):
        # Replace the whole content of the element: remove every child
        # element first, then store the new value as the element's text.
        del self[:]
        self.text = value
 1293   
1294      @value.deleter 
1296          self.text = '' 
1297          del self[:] 
  1298   
1299   
# Map the ``textarea`` tag name to TextareaElement in the lookup's table of
# default element classes.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1304      """ 
1305      ``<select>`` element.  You can get the name with ``.name``. 
1306   
1307      ``.value`` will be the value of the selected option, unless this 
1308      is a multi-select element (``<select multiple>``), in which case 
1309      it will be a set-like object.  In either case ``.value_options`` 
1310      gives the possible values. 
1311   
1312      The boolean attribute ``.multiple`` shows if this is a 
1313      multi-select. 
1314      """ 
1315      @property 
1317          """ 
1318          Get/set the value of this select (the selected option). 
1319   
1320          If this is a multi-select, this is a set-like object that 
1321          represents all the selected options. 
1322          """ 
1323          if self.multiple: 
1324              return MultipleSelectOptions(self) 
1325          options = _options_xpath(self) 
1326   
1327          try: 
1328              selected_option = next(el for el in reversed(options) if el.get('selected') is not None) 
1329          except StopIteration: 
1330              try: 
1331                  selected_option = next(el for el in options if el.get('disabled') is None) 
1332              except StopIteration: 
1333                  return None 
1334          value = selected_option.get('value') 
1335          if value is None: 
1336              value = (selected_option.text or '').strip() 
1337          return value 
 1338   
1339      @value.setter 
1340 -    def value(self, value): 
 1341          if self.multiple: 
1342              if isinstance(value, basestring): 
1343                  raise TypeError("You must pass in a sequence") 
1344              values = self.value 
1345              values.clear() 
1346              values.update(value) 
1347              return 
1348          checked_option = None 
1349          if value is not None: 
1350              for el in _options_xpath(self): 
1351                  opt_value = el.get('value') 
1352                  if opt_value is None: 
1353                      opt_value = (el.text or '').strip() 
1354                  if opt_value == value: 
1355                      checked_option = el 
1356                      break 
1357              else: 
1358                  raise ValueError( 
1359                      "There is no option with the value of %r" % value) 
1360          for el in _options_xpath(self): 
1361              if 'selected' in el.attrib: 
1362                  del el.attrib['selected'] 
1363          if checked_option is not None: 
1364              checked_option.set('selected', '') 
 1365   
1366      @value.deleter 
1373   
1374      @property 
1387   
1388      @property 
1390          """ 
1391          Boolean attribute: is there a ``multiple`` attribute on this element. 
1392          """ 
1393          return 'multiple' in self.attrib 
 1394   
1395      @multiple.setter 
1397          if value: 
1398              self.set('multiple', '') 
1399          elif 'multiple' in self.attrib: 
1400              del self.attrib['multiple'] 
  1401   
1402   
# Map the ``select`` tag name to SelectElement in the lookup's table of
# default element classes.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1407      """ 
1408      Represents all the selected options in a ``<select multiple>`` element. 
1409   
1410      You can add to this set-like option to select an option, or remove 
1411      to unselect the option. 
1412      """ 
1413   
1415          self.select = select 
 1416   
1417      @property 
1419          """ 
1420          Iterator of all the ``<option>`` elements. 
1421          """ 
1422          return iter(_options_xpath(self.select)) 
 1423   
1425          for option in self.options: 
1426              if 'selected' in option.attrib: 
1427                  opt_value = option.get('value') 
1428                  if opt_value is None: 
1429                      opt_value = (option.text or '').strip() 
1430                  yield opt_value 
 1431   
1432 -    def add(self, item): 
 1433          for option in self.options: 
1434              opt_value = option.get('value') 
1435              if opt_value is None: 
1436                  opt_value = (option.text or '').strip() 
1437              if opt_value == item: 
1438                  option.set('selected', '') 
1439                  break 
1440          else: 
1441              raise ValueError( 
1442                  "There is no option with the value %r" % item) 
 1443   
1445          for option in self.options: 
1446              opt_value = option.get('value') 
1447              if opt_value is None: 
1448                  opt_value = (option.text or '').strip() 
1449              if opt_value == item: 
1450                  if 'selected' in option.attrib: 
1451                      del option.attrib['selected'] 
1452                  else: 
1453                      raise ValueError( 
1454                          "The option %r is not currently selected" % item) 
1455                  break 
1456          else: 
1457              raise ValueError( 
1458                  "There is not option with the value %r" % item) 
 1459   
1461          return '<%s {%s} for select name=%r>' % ( 
1462              self.__class__.__name__, 
1463              ', '.join([repr(v) for v in self]), 
1464              self.select.name) 
  1465   
1468      """ 
1469      This object represents several ``<input type=radio>`` elements 
1470      that have the same name. 
1471   
1472      You can use this like a list, but also use the property 
1473      ``.value`` to check/uncheck inputs.  Also you can use 
1474      ``.value_options`` to get the possible values. 
1475      """ 
1476      @property 
1478          """ 
1479          Get/set the value, which checks the radio with that value (and 
1480          unchecks any other value). 
1481          """ 
1482          for el in self: 
1483              if 'checked' in el.attrib: 
1484                  return el.get('value') 
1485          return None 
 1486   
1487      @value.setter 
1488 -    def value(self, value): 
 1489          checked_option = None 
1490          if value is not None: 
1491              for el in self: 
1492                  if el.get('value') == value: 
1493                      checked_option = el 
1494                      break 
1495              else: 
1496                  raise ValueError("There is no radio input with the value %r" % value) 
1497          for el in self: 
1498              if 'checked' in el.attrib: 
1499                  del el.attrib['checked'] 
1500          if checked_option is not None: 
1501              checked_option.set('checked', '') 
 1502   
1503      @value.deleter 
1506   
1507      @property 
1509          """ 
1510          Returns a list of all the possible values. 
1511          """ 
1512          return [el.get('value') for el in self] 
 1513   
1515          return '%s(%s)' % ( 
1516              self.__class__.__name__, 
1517              list.__repr__(self)) 
  1518   
1521      """ 
1522      Represents a group of checkboxes (``<input type=checkbox>``) that 
1523      have the same name. 
1524   
1525      In addition to using this like a list, the ``.value`` attribute 
1526      returns a set-like object that you can add to or remove from to 
1527      check and uncheck checkboxes.  You can also use ``.value_options`` 
1528      to get the possible values. 
1529      """ 
1530      @property 
1532          """ 
1533          Return a set-like object that can be modified to check or 
1534          uncheck individual checkboxes according to their value. 
1535          """ 
1536          return CheckboxValues(self) 
 1537   
1538      @value.setter 
1539 -    def value(self, value): 
 1547   
1548      @value.deleter 
1551   
1552      @property 
1554          """ 
1555          Returns a list of all the possible values. 
1556          """ 
1557          return [el.get('value') for el in self] 
 1558   
1560          return '%s(%s)' % ( 
1561              self.__class__.__name__, list.__repr__(self)) 
  1562   
1565      """ 
1566      Represents the values of the checked checkboxes in a group of 
1567      checkboxes with the same name. 
1568      """ 
1569   
1572   
1574          return iter([ 
1575              el.get('value') 
1576              for el in self.group 
1577              if 'checked' in el.attrib]) 
 1578   
1579 -    def add(self, value): 
 1580          for el in self.group: 
1581              if el.get('value') == value: 
1582                  el.set('checked', '') 
1583                  break 
1584          else: 
1585              raise KeyError("No checkbox with value %r" % value) 
 1586   
1588          for el in self.group: 
1589              if el.get('value') == value: 
1590                  if 'checked' in el.attrib: 
1591                      del el.attrib['checked'] 
1592                  else: 
1593                      raise KeyError( 
1594                          "The checkbox with value %r was already unchecked" % value) 
1595                  break 
1596          else: 
1597              raise KeyError( 
1598                  "No checkbox with value %r" % value) 
 1599   
1601          return '<%s {%s} for checkboxes name=%r>' % ( 
1602              self.__class__.__name__, 
1603              ', '.join([repr(v) for v in self]), 
1604              self.group.name) 
  1605   
1699   
1700   
# Map the ``input`` tag name to InputElement in the lookup's table of default
# element classes.
HtmlElementClassLookup._default_element_classes['input'] = InputElement
1705      """ 
1706      Represents a ``<label>`` element. 
1707   
1708      Label elements are linked to other elements with their ``for`` 
1709      attribute.  You can access this element with ``label.for_element``. 
1710      """ 
1711      @property 
1713          """ 
1714          Get/set the element this label points to.  Return None if it 
1715          can't be found. 
1716          """ 
1717          id = self.get('for') 
1718          if not id: 
1719              return None 
1720          return self.body.get_element_by_id(id) 
 1721   
1722      @for_element.setter 
1724          id = other.get('id') 
1725          if not id: 
1726              raise TypeError( 
1727                  "Element %r has no id attribute" % other) 
1728          self.set('for', id) 
 1729   
1730      @for_element.deleter 
 1735   
1736   
# Map the ``label`` tag name to LabelElement in the lookup's table of default
# element classes.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1757   
1760      """Convert all tags in an XHTML tree to HTML by removing their 
1761      XHTML namespace. 
1762      """ 
1763      try: 
1764          xhtml = xhtml.getroot() 
1765      except AttributeError: 
1766          pass 
1767      prefix = "{%s}" % XHTML_NAMESPACE 
1768      prefix_len = len(prefix) 
1769      for el in xhtml.iter(prefix + "*"): 
1770          el.tag = el.tag[prefix_len:] 
 1771   
1772   
1773   
1774   
# Pre-bound ``sub`` methods of two compiled patterns matching a
# ``<meta http-equiv="Content-Type" ...>`` tag: one operates on text, the
# other (built from the ASCII-encoded pattern) operates on bytes.
# ``tostring()`` uses them to strip that tag from its output.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1779   
1780   
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc,
        method=method,
        pretty_print=pretty_print,
        encoding=encoding,
        with_tail=with_tail,
        doctype=doctype,
    )
    # The meta tag is only stripped from HTML output, and only when the
    # caller did not ask to keep it.
    if method != 'html' or include_meta_content_type:
        return serialised
    # Use the substitution that matches the type of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)
 1853   
1854   
# Post-process the docstring with ``__fix_docstring``.
# NOTE(review): presumably this adapts the ``u'``/``b'`` prefixes in the
# doctest output to the running Python version -- confirm against the helper.
tostring.__doc__ = __fix_docstring(tostring.__doc__)
1859      """ 
1860      Open the HTML document in a web browser, saving it to a temporary 
1861      file to open it.  Note that this does not delete the file after 
1862      use.  This is mainly meant for debugging. 
1863      """ 
1864      import os 
1865      import webbrowser 
1866      import tempfile 
1867      if not isinstance(doc, etree._ElementTree): 
1868          doc = etree.ElementTree(doc) 
1869      handle, fn = tempfile.mkstemp(suffix='.html') 
1870      f = os.fdopen(handle, 'wb') 
1871      try: 
1872          doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8") 
1873      finally: 
1874           
1875          f.close() 
1876      url = 'file://' + fn.replace(os.path.sep, '/') 
1877      print(url) 
1878      webbrowser.open(url) 
 1879   
1880   
1881   
1882   
1883   
1884   
1885 -class HTMLParser(etree.HTMLParser): 
 1886      """An HTML parser that is configured to return lxml.html Element 
1887      objects. 
1888      """ 
 1892   
1895      """An XML parser that is configured to return lxml.html Element 
1896      objects. 
1897   
1898      Note that this parser is not really XHTML aware unless you let it 
1899      load a DTD that declares the HTML entities.  To do this, make sure 
1900      you have the XHTML DTDs installed in your catalogs, and create the 
1901      parser like this:: 
1902   
1903          >>> parser = XHTMLParser(load_dtd=True) 
1904   
1905      If you additionally want to validate the document, use this:: 
1906   
1907          >>> parser = XHTMLParser(dtd_validation=True) 
1908   
1909      For catalog support, see http://www.xmlsoft.org/catalog.html. 
1910      """ 
 1914   
1917      """Create a new HTML Element. 
1918   
1919      This can also be used for XHTML documents. 
1920      """ 
1921      v = html_parser.makeelement(*args, **kw) 
1922      return v 
 1923   
1924   
# Module-level parser instances, created once when the module is imported.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
1927