parser.py

Go to the documentation of this file.
00001 from silme.core.entity import EntityList, Entity
00002 from structure import *
00003 
00004 from xml.dom import minidom
00005 from xml.dom import Node
00006 
class HTMLParser():
    """Parser exposing classmethod entry points for HTML/XML text.

    The class holds no state; every method is a classmethod.
    """

    @classmethod
    def parse(cls, text, code='default'):
        """Build and return an HTMLStructure for *text*.

        A new HTMLStructure is created, handed to ``build_element_list``
        together with *text* and *code*, and its ``fallback`` attribute is
        set to *code* before it is returned.

        NOTE: ``build_element_list`` raises NotImplementedError in this
        class, so this call fails unless that method is overridden.
        """
        html_structure = HTMLStructure()
        cls.build_element_list(text, html_structure, code=code)
        html_structure.fallback = code
        return html_structure

    @classmethod
    def parse_to_entitylist(cls, text, code='default'):
        """Return an EntityList with one entity per text node in *text*.

        *text* is parsed with ``minidom.parseString``; every text node that
        is a direct child of any element becomes one entity (see
        ``parse_entity``).
        """
        entitylist = EntityList()
        xmldoc = minidom.parseString(text)
        # '*' matches every element in the document.
        for element in xmldoc.getElementsByTagName('*'):
            for child in element.childNodes:
                if child.nodeType == Node.TEXT_NODE:
                    entitylist.add_entity(cls.parse_entity(child, code))
        return entitylist

    @classmethod
    def parse_entity(cls, node, code='default'):
        """Return an Entity for the DOM text node *node*.

        The entity id is the absolute XPath of *node* (``abs_path``), its
        value is the node's ``data`` and *code* is passed through unchanged.
        """
        # The parameter is ``node``; the previous body referenced an
        # undefined ``cnode`` and raised NameError on every call.
        entity_id = abs_path(node)
        value = node.data
        return Entity(entity_id, value, code)

    @classmethod
    def build_element_list (cls, text, object, type='comment', code='default', pointer=0, end=None):
        """Populate *object* from *text*; not implemented in this class.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError()
00036 
# Mapping from DOM node type to the name of the XPath node-test function
# that selects nodes of that type (emitted as ``name()`` in a path step).
OTHER_NODES = {
    Node.TEXT_NODE: 'text',
    Node.COMMENT_NODE: 'comment',
    Node.PROCESSING_INSTRUCTION_NODE: 'processing-instruction',
}
00043 
def abs_path(node):
    """
    Return an XPath expression that provides a unique path to
    the given node (supports elements, attributes, root nodes,
    text nodes, comments and PIs) within a document.

    Each step is built from the node itself and the path of its parent
    (or owner element, for attributes) is prepended recursively:

    * elements:            ``name[n]``, where n is the 1-based position
                           among preceding sibling elements with the same
                           ``nodeName``
    * attributes:          ``@name``
    * text, comments, PIs: ``test()[n]``, where n is the 1-based position
                           among preceding siblings of the same node type
    * a node without a parent that is none of the above: ``/``

    Raises TypeError for any other node type.
    """
    if node.nodeType == Node.ELEMENT_NODE:
        # Count preceding siblings by nodeName, the same name that is
        # emitted in the step. Counting by localName would number
        # <a:i/><b:i/> as b:i[2], a step that matches nothing.
        count = 1
        previous = node.previousSibling
        while previous is not None:
            if (previous.nodeType == Node.ELEMENT_NODE
                    and previous.nodeName == node.nodeName):
                count += 1
            previous = previous.previousSibling
        step = u'%s[%i]' % (node.nodeName, count)
        ancestor = node.parentNode
    elif node.nodeType == Node.ATTRIBUTE_NODE:
        # Attributes hang off their owner element, not a parentNode.
        step = u'@%s' % (node.nodeName)
        ancestor = node.ownerElement
    elif node.nodeType in OTHER_NODES:
        # Text nodes, comments and PIs: position among preceding siblings
        # of the same node type.
        count = 1
        previous = node.previousSibling
        while previous is not None:
            if previous.nodeType == node.nodeType:
                count += 1
            previous = previous.previousSibling
        test_func = OTHER_NODES[node.nodeType]
        step = u'%s()[%i]' % (test_func, count)
        ancestor = node.parentNode
    elif not node.parentNode:
        # Root node: empty step, so the result below is just '/'.
        step = u''
        ancestor = node
    else:
        raise TypeError('Unsupported node type for abs_path')
    if ancestor.parentNode:
        # The ancestor is itself below the root: prepend its path.
        return abs_path(ancestor) + u'/' + step
    else:
        # The ancestor is the root, so the path starts here.
        return u'/' + step
00083 

Generated on Tue May 12 17:37:27 2009 for silme by  doxygen 1.5.8