opt
/
hc_python
/
lib
/
python3.12
/
site-packages
/
lxml
/
html
/
Go to Home Directory
+
Upload
Create File
root@0UT1S:~$
Execute
By Order of Mr.0UT1S
[DIR] ..
N/A
[DIR] __pycache__
N/A
ElementSoup.py
320 bytes
Rename
Delete
__init__.py
62.79 KB
Rename
Delete
_diffcommand.py
2.03 KB
Rename
Delete
_html5builder.py
3.15 KB
Rename
Delete
_setmixin.py
1.16 KB
Rename
Delete
builder.py
4.39 KB
Rename
Delete
clean.py
503 bytes
Rename
Delete
defs.py
4.14 KB
Rename
Delete
diff.cpython-312-x86_64-linux-gnu.so
352.18 KB
Rename
Delete
diff.py
29.68 KB
Rename
Delete
formfill.py
9.45 KB
Rename
Delete
html5parser.py
8.43 KB
Rename
Delete
soupparser.py
9.96 KB
Rename
Delete
usedoctest.py
249 bytes
Rename
Delete
"""External interface to the BeautifulSoup HTML parser. """ __all__ = ["fromstring", "parse", "convert_tree"] import re from lxml import etree, html try: from bs4 import ( BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, Declaration, Doctype) _DECLARATION_OR_DOCTYPE = (Declaration, Doctype) except ImportError: from BeautifulSoup import ( BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, Declaration) _DECLARATION_OR_DOCTYPE = Declaration def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs): """Parse a string of HTML data into an Element tree using the BeautifulSoup parser. Returns the root ``<html>`` Element of the tree. You can pass a different BeautifulSoup parser through the `beautifulsoup` keyword, and a diffent Element factory function through the `makeelement` keyword. By default, the standard ``BeautifulSoup`` class and the default factory of `lxml.html` are used. """ return _parse(data, beautifulsoup, makeelement, **bsargs) def parse(file, beautifulsoup=None, makeelement=None, **bsargs): """Parse a file into an ElemenTree using the BeautifulSoup parser. You can pass a different BeautifulSoup parser through the `beautifulsoup` keyword, and a diffent Element factory function through the `makeelement` keyword. By default, the standard ``BeautifulSoup`` class and the default factory of `lxml.html` are used. """ if not hasattr(file, 'read'): file = open(file) root = _parse(file, beautifulsoup, makeelement, **bsargs) return etree.ElementTree(root) def convert_tree(beautiful_soup_tree, makeelement=None): """Convert a BeautifulSoup tree to a list of Element trees. Returns a list instead of a single root Element to support HTML-like soup with more than one root element. You can pass a different Element factory through the `makeelement` keyword. """ root = _convert_tree(beautiful_soup_tree, makeelement) children = root.getchildren() for child in children: root.remove(child) return children # helpers def _parse(source, beautifulsoup, makeelement, **bsargs): if beautifulsoup is None: beautifulsoup = BeautifulSoup if hasattr(beautifulsoup, "HTML_ENTITIES"): # bs3 if 'convertEntities' not in bsargs: bsargs['convertEntities'] = 'html' if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"): # bs4 if 'features' not in bsargs: bsargs['features'] = 'html.parser' # use Python html parser tree = beautifulsoup(source, **bsargs) root = _convert_tree(tree, makeelement) # from ET: wrap the document in a html root element, if necessary if len(root) == 1 and root[0].tag == "html": return root[0] root.tag = "html" return root _parse_doctype_declaration = re.compile( r'(?:\s|[<!])*DOCTYPE\s*HTML' r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?' r'(?:\s+(\'[^\']*\'|"[^"]*"))?', re.IGNORECASE).match class _PseudoTag: # Minimal imitation of BeautifulSoup.Tag def __init__(self, contents): self.name = 'html' self.attrs = [] self.contents = contents def __iter__(self): return self.contents.__iter__() def _convert_tree(beautiful_soup_tree, makeelement): if makeelement is None: makeelement = html.html_parser.makeelement # Split the tree into three parts: # i) everything before the root element: document type # declaration, comments, processing instructions, whitespace # ii) the root(s), # iii) everything after the root: comments, processing # instructions, whitespace first_element_idx = last_element_idx = None html_root = declaration = None for i, e in enumerate(beautiful_soup_tree): if isinstance(e, Tag): if first_element_idx is None: first_element_idx = i last_element_idx = i if html_root is None and e.name and e.name.lower() == 'html': html_root = e elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE): declaration = e # For a nice, well-formatted document, the variable roots below is # a list consisting of a single <html> element. However, the document # may be a soup like '<meta><head><title>Hello</head><body>Hi # all<\p>'. In this example roots is a list containing meta, head # and body elements. if first_element_idx is None: pre_root = post_root = [] roots = beautiful_soup_tree.contents else: pre_root = beautiful_soup_tree.contents[:first_element_idx] roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1] post_root = beautiful_soup_tree.contents[last_element_idx+1:] # Reorganize so that there is one <html> root... if html_root is not None: # ... use existing one if possible, ... i = roots.index(html_root) html_root.contents = roots[:i] + html_root.contents + roots[i+1:] else: # ... otherwise create a new one. html_root = _PseudoTag(roots) convert_node = _init_node_converters(makeelement) # Process pre_root res_root = convert_node(html_root) prev = res_root for e in reversed(pre_root): converted = convert_node(e) if converted is not None: prev.addprevious(converted) prev = converted # ditto for post_root prev = res_root for e in post_root: converted = convert_node(e) if converted is not None: prev.addnext(converted) prev = converted if declaration is not None: try: # bs4 provides full Doctype string doctype_string = declaration.output_ready() except AttributeError: doctype_string = declaration.string match = _parse_doctype_declaration(doctype_string) if not match: # Something is wrong if we end up in here. Since soupparser should # tolerate errors, do not raise Exception, just let it pass. pass else: external_id, sys_uri = match.groups() docinfo = res_root.getroottree().docinfo # strip quotes and update DOCTYPE values (any of None, '', '...') docinfo.public_id = external_id and external_id[1:-1] docinfo.system_url = sys_uri and sys_uri[1:-1] return res_root def _init_node_converters(makeelement): converters = {} ordered_node_types = [] def converter(*types): def add(handler): for t in types: converters[t] = handler ordered_node_types.append(t) return handler return add def find_best_converter(node): for t in ordered_node_types: if isinstance(node, t): return converters[t] return None def convert_node(bs_node, parent=None): # duplicated in convert_tag() below try: handler = converters[type(bs_node)] except KeyError: handler = converters[type(bs_node)] = find_best_converter(bs_node) if handler is None: return None return handler(bs_node, parent) def map_attrs(bs_attrs): if isinstance(bs_attrs, dict): # bs4 attribs = {} for k, v in bs_attrs.items(): if isinstance(v, list): v = " ".join(v) attribs[k] = unescape(v) else: attribs = {k: unescape(v) for k, v in bs_attrs} return attribs def append_text(parent, text): if len(parent) == 0: parent.text = (parent.text or '') + text else: parent[-1].tail = (parent[-1].tail or '') + text # converters are tried in order of their definition @converter(Tag, _PseudoTag) def convert_tag(bs_node, parent): attrs = bs_node.attrs if parent is not None: attribs = map_attrs(attrs) if attrs else None res = etree.SubElement(parent, bs_node.name, attrib=attribs) else: attribs = map_attrs(attrs) if attrs else {} res = makeelement(bs_node.name, attrib=attribs) for child in bs_node: # avoid double recursion by inlining convert_node(), see above try: handler = converters[type(child)] except KeyError: pass else: if handler is not None: handler(child, res) continue convert_node(child, res) return res @converter(Comment) def convert_comment(bs_node, parent): res = html.HtmlComment(bs_node) if parent is not None: parent.append(res) return res @converter(ProcessingInstruction) def convert_pi(bs_node, parent): if bs_node.endswith('?'): # The PI is of XML style (<?as df?>) but BeautifulSoup # interpreted it as being SGML style (<?as df>). Fix. bs_node = bs_node[:-1] res = etree.ProcessingInstruction(*bs_node.split(' ', 1)) if parent is not None: parent.append(res) return res @converter(NavigableString) def convert_text(bs_node, parent): if parent is not None: append_text(parent, unescape(bs_node)) return None return convert_node # copied from ET's ElementSoup try: from html.entities import name2codepoint # Python 3 except ImportError: from htmlentitydefs import name2codepoint handle_entities = re.compile(r"&(\w+);").sub try: unichr except NameError: # Python 3 unichr = chr def unescape(string): if not string: return '' # work around oddities in BeautifulSoup's entity handling def unescape_entity(m): try: return unichr(name2codepoint[m.group(1)]) except KeyError: return m.group(0) # use as is return handle_entities(unescape_entity, string)
Save