parser.py
Go to the documentation of this file.00001 from silme.core.entity import EntityList, Entity
00002 from structure import *
00003
00004 from xml.dom import minidom
00005 from xml.dom import Node
00006
class HTMLParser():
    """
    Parser entry points for HTML/XML text.

    All methods are classmethods; the class carries no instance state.
    """

    @classmethod
    def parse(cls, text, code='default'):
        """
        Parse `text` into a new HTMLStructure and return it.

        The structure is filled by `build_element_list` and its
        `fallback` attribute is set to `code`.  Because
        `build_element_list` is not implemented in this class, calling
        this method raises NotImplementedError unless a subclass
        overrides `build_element_list`.
        """
        dtd = HTMLStructure()
        cls.build_element_list(text, dtd, code=code)
        dtd.fallback = code
        return dtd

    @classmethod
    def parse_to_entitylist(cls, text, code='default'):
        """
        Parse `text` as XML and return an EntityList holding one entity
        per text node found directly under any element.

        `text` is handed to `minidom.parseString`, so it must be
        well-formed XML.  No filtering is applied: whitespace-only text
        nodes produce entities as well.
        """
        entitylist = EntityList()
        xmldoc = minidom.parseString(text)
        # '*' matches every element in the document.
        nodes = xmldoc.getElementsByTagName('*')
        for node in nodes:
            for cnode in node.childNodes:
                if cnode.nodeType == node.TEXT_NODE:
                    entitylist.add_entity(cls.parse_entity(cnode, code))
        return entitylist

    @classmethod
    def parse_entity(cls, node, code='default'):
        """
        Build an Entity from a DOM node.

        The entity id is the node's absolute path as computed by
        `abs_path`, and its value is the node's `data` attribute.
        """
        # The parameter is `node`; the previous body read an undefined
        # name (`cnode`) and therefore always raised NameError.
        entity_id = abs_path(node)
        value = node.data
        return Entity(entity_id, value, code)

    @classmethod
    def build_element_list(cls, text, object, type='comment', code='default', pointer=0, end=None):
        """
        Placeholder for populating `object` from `text`.

        Not implemented here: always raises NotImplementedError.
        """
        raise NotImplementedError()
00036
00037
# Maps the DOM node types that abs_path() addresses through an XPath
# node-test function to the name of that function (emitted as "name()").
OTHER_NODES = {
    Node.TEXT_NODE: 'text',
    Node.COMMENT_NODE: 'comment',
    Node.PROCESSING_INSTRUCTION_NODE: 'processing-instruction',
}
00043
def abs_path(node):
    """
    Return an XPath expression that provides a unique path to
    the given node (supports elements, attributes, root nodes,
    text nodes, comments and PIs) within a document.

    Each step carries a 1-based positional predicate: elements are
    counted among preceding sibling elements with the same namespace
    URI and local name, while text/comment/PI nodes are counted among
    preceding siblings of the same node type.  A parentless node of any
    other type is treated as the root and yields u'/'.

    A node that is detached from any tree (no parent / owner element)
    yields a single-step path such as u'/name[1]'.

    Raises TypeError for any other node type that has a parent.
    """
    steps = []
    current = node
    # Walk upwards collecting one step per level; iterating instead of
    # recursing keeps very deep trees clear of the recursion limit.
    while True:
        kind = current.nodeType
        if kind == Node.ELEMENT_NODE:
            position = 1
            sibling = current.previousSibling
            while sibling is not None:
                # Count only elements with the same expanded name, so
                # the index agrees with how XPath evaluates the
                # qualified name test written into the step.
                if (sibling.nodeType == Node.ELEMENT_NODE
                        and sibling.localName == current.localName
                        and sibling.namespaceURI == current.namespaceURI):
                    position += 1
                sibling = sibling.previousSibling
            step = u'%s[%i]' % (current.nodeName, position)
            ancestor = current.parentNode
        elif kind == Node.ATTRIBUTE_NODE:
            step = u'@%s' % (current.nodeName)
            ancestor = current.ownerElement
        elif kind in OTHER_NODES:
            position = 1
            sibling = current.previousSibling
            while sibling is not None:
                if sibling.nodeType == kind:
                    position += 1
                sibling = sibling.previousSibling
            step = u'%s()[%i]' % (OTHER_NODES[kind], position)
            ancestor = current.parentNode
        elif not current.parentNode:
            # Root node: contributes an empty step.
            step = u''
            ancestor = current
        else:
            raise TypeError('Unsupported node type for abs_path')

        steps.append(step)
        # Stop once the ancestor is the root, or when the node is
        # detached and has no ancestor at all (previously a crash).
        if ancestor is None or not ancestor.parentNode:
            break
        current = ancestor

    steps.reverse()
    return u'/' + u'/'.join(steps)
00083