00001 from silme.core.entity import EntityList, Entity
00002 from structure import *
00003 import re
00004
class DTDParser():
    """Regular-expression based parser for DTD entity declarations.

    All methods are classmethods; the class only carries the compiled
    patterns.  Two constructs are recognised:

    * ``<!ENTITY name "value">`` (single or double quoted value)
    * ``<!-- comment -->``

    Anything else is passed through as plain text chunks.

    NOTE: the parameter names ``object`` and ``type`` shadow builtins; they
    are kept unchanged because they are part of the public signatures.
    """

    # Character classes follow the XML 1.0 NameStartChar / NameChar
    # productions, restricted to the Basic Multilingual Plane.
    name_start_char = (
        u':A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\u02FF'
        u'\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF'
        u'\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD'
    )
    # Backslashes are doubled instead of using a ``ur''`` literal, which is
    # a syntax error on Python 3; the resulting string value is identical.
    name_char = (name_start_char + u'\\-\\.0-9' +
                 u'\xB7\u0300-\u036F\u203F-\u2040')
    name = u'[' + name_start_char + u'][' + name_char + u']*'

    patterns = {}
    # group(1): entity name, group(2): value including surrounding quotes.
    patterns['entity'] = re.compile(
        u'<!ENTITY\\s+(' + name +
        u')\\s+((?:"[^"]*")|(?:\'[^\']*\'))\\s*>',
        re.S | re.U)
    # group(1): comment body between ``--`` markers.
    patterns['comment'] = re.compile(
        u'<!\\s*--(.*?)(?:--\\s*>)', re.M | re.S)

    @staticmethod
    def _limit(text, end):
        """Return the effective upper bound for scanning ``text``.

        A falsy ``end`` (``None`` or ``0``) means "up to the end of the
        text"; otherwise the bound is clamped to ``len(text)``.
        """
        if not end:
            return len(text)
        return min(end, len(text))

    @classmethod
    def parse(cls, text, code='default'):
        """Parse ``text`` into a new ``DTDStructure``.

        The structure is filled through ``build_element_list`` and its
        ``fallback`` attribute is set to ``code`` before it is returned.
        """
        dtd = DTDStructure()
        cls.build_element_list(text, dtd, code=code)
        dtd.fallback = code
        return dtd

    @classmethod
    def parse_to_entitylist(cls, text, code='default'):
        """Return an ``EntityList`` with every entity declared in ``text``.

        Comments are removed first, so entity declarations that appear
        inside a comment are not collected.
        """
        entitylist = EntityList()
        uncommented = cls.patterns['comment'].sub('', text)
        for entity_id, quoted in cls.patterns['entity'].findall(uncommented):
            # Strip the surrounding quote characters from the value.
            entitylist.add_entity(Entity(entity_id, quoted[1:-1], code))
        return entitylist

    @classmethod
    def parse_entity(cls, text, code='default'):
        """Parse a single entity declaration at the start of ``text``.

        Returns an ``Entity`` whose id is the declared name and whose value
        (for ``code``) is the quoted value without its quotes.

        Raises ``ValueError`` (an ``Exception`` subclass) when ``text`` does
        not start with an entity declaration.
        """
        match = cls.patterns['entity'].match(text)
        if not match:
            raise ValueError(
                'text does not start with an <!ENTITY ...> declaration')
        # group(1) is the name and group(2) the quoted value; group(0)
        # would be the whole declaration.
        entity = Entity(match.group(1))
        entity.set_value(match.group(2)[1:-1], code)
        return entity

    @classmethod
    def build_element_list(cls, text, object, type='comment', code='default',
                           pointer=0, end=None):
        """Append the elements found in ``text[pointer:end]`` to ``object``.

        ``type`` is accepted for interface compatibility and is not used.
        """
        cls.split_comments(text, object, code=code, pointer=pointer, end=end)

    @classmethod
    def split_comments(cls, text, object, code='default', pointer=0,
                       end=None):
        """Split ``text[pointer:end]`` into comments and non-comment parts.

        Each comment becomes a ``Comment`` whose content is produced by
        ``split_entities`` on the comment body; the text between comments is
        handed to ``split_entities`` as well.  Everything is appended to
        ``object`` in document order.  A falsy ``end`` means "to the end of
        the text".
        """
        limit = cls._limit(text, end)
        for match in cls.patterns['comment'].finditer(text, pointer, limit):
            start = match.start(0)
            if start > pointer:
                cls.split_entities(text, object, code=code,
                                   pointer=pointer, end=start)
            comment = Comment()
            cls.split_entities(match.group(1), comment, code=code)
            object.append(comment)
            pointer = match.end(0)
        # Trailing part after the last comment, bounded by ``end`` as well.
        if limit > pointer:
            cls.split_entities(text, object, code=code,
                               pointer=pointer, end=limit)

    @classmethod
    def split_entities(cls, text, object, code='default', pointer=0,
                       end=None):
        """Split ``text[pointer:end]`` into entities and plain text chunks.

        Each entity declaration is appended to ``object`` as an ``Entity``
        carrying a ``params['source']`` dict with the original declaration
        string and ``valpos``, the offset of the value's first character
        inside that string.  Text between declarations is appended as plain
        strings.  A falsy ``end`` means "to the end of the text".
        """
        limit = cls._limit(text, end)
        for match in cls.patterns['entity'].finditer(text, pointer, limit):
            start = match.start(0)
            if start > pointer:
                object.append(text[pointer:start])
            entity_id, quoted = match.groups()
            entity = Entity(entity_id)
            entity.set_value(quoted[1:-1], code)
            entity.params['source'] = {
                'type': 'dtd',
                'string': match.group(0),
                # +1 skips the opening quote of the value.
                'valpos': match.start(2) + 1 - start,
            }
            object.append(entity)
            pointer = match.end(0)
        if limit > pointer:
            object.append(text[pointer:limit])