Source code for dhtmlparser

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Imports =====================================================================
import gc

from . import specialdict
from . import htmlelement

from .htmlelement import HTMLElement
from .htmlelement import _rotate_buff


# Functions ===================================================================
[docs]class StateEnum(object): _cnt = (x for x in range(100)) content = next(_cnt) tag = next(_cnt) parameter = next(_cnt) comment = next(_cnt)
[docs]def first(inp_data): """ Return first element from `inp_data`, or raise StopIteration. Note: This function was created because it works for generators, lists, iterators, tuples and so on same way, which indexing doesn't. Also it have smaller cost than list(generator)[0], because it doesn't convert whole `inp_data` to list. Args: inp_data (iterable): Any iterable object. Raises: StopIteration: When the `inp_data` is blank. """ return next(x for x in inp_data)
def _raw_split(itxt): """ Parse HTML from text into array filled with tags end text. Source code is little bit unintutive, because it is state machine parser. For better understanding, look at http://bit.ly/1rXRcJj Example:: >>> dhtmlparser._raw_split('<html><tag params="true"></html>') ['<html>', '<tag params="true">', '</html>'] Args: itxt (str): Input HTML text, which will be parsed. Returns: list: List of strings (input splitted to tags and text). """ echr = "" buff = ["", "", "", ""] content = "" array = [] next_state = 0 inside_tag = False escaped = False COMMENT_START = ["-", "!", "<"] COMMENT_END = ["-", "-"] gc.disable() for c in itxt: # content if next_state == StateEnum.content: if c == "<": if content: array.append(content) content = c next_state = StateEnum.tag inside_tag = False else: content += c # html tag elif next_state == StateEnum.tag: if c == ">": array.append(content + c) content = "" next_state = StateEnum.content elif c == "'" or c == '"': echr = c content += c next_state = StateEnum.parameter elif c == "-" and buff[:3] == COMMENT_START: if content[:-3]: array.append(content[:-3]) content = content[-3:] + c next_state = StateEnum.comment else: if c == "<": # jump back into tag instead of content array.append(content) inside_tag = True content = "" content += c # quotes "" / '' elif next_state == StateEnum.parameter: if c == echr and not escaped: # end of quotes next_state = StateEnum.tag # unescaped end of line - this is good for invalid HTML like # <a href=something">..., because it allows recovery if c == "\n" and not escaped and buff[0] == ">": next_state = StateEnum.content inside_tag = False content += c escaped = not escaped if c == "\\" else False # html comments elif next_state == StateEnum.comment: if c == ">" and buff[:2] == COMMENT_END: next_state = StateEnum.tag if inside_tag else StateEnum.content inside_tag = False array.append(content + c) content = "" else: content += c # rotate buffer buff = _rotate_buff(buff) buff[0] = c gc.enable() if content: array.append(content) return array def _indexOfEndTag(istack): """ Go through `istack` and search endtag. Element at first index is considered as opening tag. Args: istack (list): List of :class:`.HTMLElement` objects. Returns: int: Index of end tag or 0 if not found. """ if len(istack) <= 0: return 0 if not istack[0].isOpeningTag(): return 0 cnt = 0 opener = istack[0] for index, el in enumerate(istack[1:]): if el.isOpeningTag() and \ el.getTagName().lower() == opener.getTagName().lower(): cnt += 1 elif el.isEndTagTo(opener): if cnt == 0: return index + 1 cnt -= 1 return 0 def _parseDOM(istack): """ Recursively go through element array and create DOM. Args: istack (list): List of :class:`.HTMLElement` objects. Returns: list: DOM tree as list. """ ostack = [] end_tag_index = 0 def neither_nonpair_or_end_or_comment(el): return not (el.isNonPairTag() or el.isEndTag() or el.isComment()) index = 0 while index < len(istack): el = istack[index] # check if this is pair tag end_tag_index = _indexOfEndTag(istack[index:]) if end_tag_index == 0 and neither_nonpair_or_end_or_comment(el): el.isNonPairTag(True) if end_tag_index == 0: if not el.isEndTag(): ostack.append(el) else: el.childs = _parseDOM(istack[index + 1: end_tag_index + index]) el.endtag = istack[end_tag_index + index] # reference to endtag el.endtag.openertag = el ostack.append(el) ostack.append(el.endtag) index = end_tag_index + index index += 1 return ostack
[docs]def parseString(txt, cip=True): """ Parse string `txt` and return DOM tree consisting of single linked :class:`.HTMLElement`. Args: txt (str): HTML/XML string, which will be parsed to DOM. cip (bool, default True): Case Insensitive Parameters. Use special dictionary to store :attr:`.HTMLElement.params` as case insensitive. Returns: obj: Single conteiner HTML element with blank tag, which has whole DOM\ in it's :attr:`.HTMLElement.childs` property. This element can be\ queried using :meth:`.HTMLElement.find` functions. """ if isinstance(txt, HTMLElement): return txt # remove UTF BOM (prettify fails if not) if len(txt) > 3 and txt[:3] == u"\xef\xbb\xbf": txt = txt[3:] if not cip: htmlelement.html_parser.SpecialDict = dict elif isinstance(htmlelement.html_parser.SpecialDict, dict): htmlelement.html_parser.SpecialDict = specialdict.SpecialDict container = HTMLElement() container.childs = _parseDOM([ HTMLElement(x) for x in _raw_split(txt) ]) return container
[docs]def makeDoubleLinked(dom, parent=None): """ Standard output from `dhtmlparser` is single-linked tree. This will make it double-linked. Args: dom (obj): :class:`.HTMLElement` instance. parent (obj, default None): Don't use this, it is used in recursive call. """ dom.parent = parent for child in dom.childs: child.parent = dom makeDoubleLinked(child, dom)
[docs]def removeTags(dom): """ Remove all tags from `dom` and obtain plaintext representation. Args: dom (str, obj, array): str, HTMLElement instance or array of elements. Returns: str: Plain string without tags. """ # python 2 / 3 shill try: string_type = basestring except NameError: string_type = str # initialize stack with proper value (based on dom parameter) element_stack = None if type(dom) in [list, tuple]: element_stack = dom elif isinstance(dom, HTMLElement): element_stack = dom.childs if dom.isTag() else [dom] elif isinstance(dom, string_type): element_stack = parseString(dom).childs else: element_stack = dom # remove all tags output = "" while element_stack: el = element_stack.pop(0) if not (el.isTag() or el.isComment() or not el.getTagName()): output += el.__str__() if el.childs: element_stack = el.childs + element_stack return output