Attachment 'machineblock4.py'

Download

   1 #!/usr/bin/python
   2 """MachineCode interpreter.
   3 
   4 Interprets and caches MachineCode from XHTML pages.
   5 
   6 DataBlockStore -- Create DataBlock instances.
   7 DataBlock -- A MachineCode block on a website.
   8 """
   9 
  10 import re
  11 
  12 import pprint
  13 import HTMLParser, htmlentitydefs
  14 
  15 import webcache
  16 
  17 import mcb_rdf
  18 
  19 class DataBlockStore:
  20 
  21     """Creates and caches data blocks.
  22 
  23     When you want a DataBlock, call "get_block". It will return a cached
  24     block, or a new block.
  25 
  26     The DataBlockStore also maintains a shared webpage cache.
  27 
  28     get_block -- Obtain a DataBlock 
  29     """
  30     
  31     def __init__(self, directory = "."):
  32         self.cache = webcache.WebCache(directory + "/pages.db",
  33                                        directory + "/times.db",
  34                                        24*60*60)  # Shared webpage cache
  35         self.blocks = {}  # Blocks cache
  36     
  37     def get_blocks(self, url, baseurl=None):
  38         """Return a DataBlock representing a website's MachineCode.
  39         
  40         If the DataBlock has already been made, return it. If not,
  41         construct it, cache it, and then return it.
  42         """
  43         if not url.startswith("http://") and baseurl:
  44             if url[0]=="#":
  45                 name = url[1:]
  46             else:
  47                 name = url
  48             url = baseurl
  49         else:
  50             name = None
  51             url = url.split("#", 1)
  52             if len(url)==2:
  53                 name = url[1]
  54             url = url[0]
  55 
  56         if not url in self.blocks:
  57             self.read_page(url)
  58 
  59         if name:
  60             result = []
  61             for block in self.blocks[url]:
  62                 if block.get("id", [""])[0]==name:
  63                     result.append(block)
  64             return result
  65         else:
  66             return self.blocks[url]
  67 
  68     def read_page(self, url):
  69         page = self.cache.get_page(url)
  70         parser = MachineCodeHtmlParser()
  71         parser.feed(page)
  72         parser.close()
  73         self.blocks[url] = parser.blocks
  74         for block in self.blocks[url]:
  75             block._url = url
  76             #block.apply_type(self)
  77             #print block.get_required_attributes(self)
  78 
  79 class DataBlock(dict):
  80     strict_types = 0
  81 
  82     def store_value(self, key, value):
  83         if self.has_key(key):
  84             self[key].append(value)
  85         else:
  86             self[key] = [value]
  87 
  88     def get_required_attributes(self, type):
  89 
  90         if not type: return []
  91         attributes = type.get("attribute", [])
  92         result = []
  93         for attr in attributes:
  94             if int(attr.get("required", [1])[0]):
  95                 if attr.has_key("key"):
  96                     result.append(attr["key"][0])
  97         return result
  98 
  99     def get_type(self, store):
 100         type = self.get("type", None)
 101         if not type: return None
 102         type = type[0]
 103         if isinstance(type, str):
 104             type = store.get_blocks(type, self._url)
 105             if not type: return None
 106             type = type[0]
 107         return type
 108 
 109     def get_type_attribute(self, type, key):
 110         for attr in type.get('attribute', []):
 111             k = attr.get('key',[])
 112             if k and k[0]==key: return attr
 113         return None
 114 
 115     def _handle_string(self, value):
 116         if not isinstance(value, str) and self.strict_types:
 117             return None
 118         else:
 119             return value
 120     def _handle_number(self, value):
 121         try:
 122             return float(value)
 123         except ValueError:
 124             print value
 125             if self.strict_types:
 126                 return None
 127             else:
 128                 return value
 129     def _handle_date(self, value):
 130         return value #XXX
 131     def _handle_url(self, value):
 132         value = self._handle_string(value)
 133         # XXX href replacement ._url extention
 134         return value
 135     def _handle_any(self, value):
 136         return value
 137     
 138     def _Handle_block(self, value, block_type, store):
 139         if isinstance(value, DataBlock):
 140             value._url = self._url
 141             type = value.get_type(store)
 142             if type is block_type:
 143                 return value
 144             elif type is None:
 145                 value['type'] = [block_type]
 146             elif self.strict_types:
 147                 return None
 148             value.apply_type(store)
 149         elif self.strict_types:
 150             return None
 151         return value
 152         
 153         
 154     def apply_type(self, store):
 155         type = self.get_type(store)
 156         if not type: return
 157         #req_type = self.get_required_attributes(type)
 158         for key, values in self.iteritems():
 159             attribute = self.get_type_attribute(type, key)
 160             if attribute:
 161                 attr_type = attribute.get("type", ['any'])[0]
 162                 if key=='multiple': print "XX\n", self, '\n', attr_type
 163                 if isinstance(attr_type, str):
 164                     attr_type = attr_type.strip()
 165                     handler = getattr(self, "_handle_%s" % attr_type,
 166                                       None)
 167                     if not handler:
 168                         type_block = store.get_blocks(attr_type, self._url)
 169                         if type_block:
 170                             type_block = type_block[0]
 171                             handler = lambda v: self._Handle_block(
 172                                 v, type_block, store)
 173                         else:
 174                             handler = self._handle_any #XXX Blocks
 175                 elif isinstance(attr_type, DataBlock):
 176                     handler = lambda v: self._Handle_block(
 177                         v, attr_type, store)
 178                     
 179                 values = map(handler, values)
 180 
 181                 if self.strict_types:
 182                     values = filter(lambda x:x is not None, values)
 183                 self[key] = values
 184                 
 185 class MachineCodeHtmlParser(HTMLParser.HTMLParser):
 186 
 187     """Interpret XHTML tag events and store the results in a DataBlock.
 188 
 189     A SAX-like handler for XHTML events.
 190 
 191     The handler waits for the text "MACHINECODE" in bold.
 192 
 193     Then it reads bold-tagged text as dictionary keys, and italic-tagged
 194     text or anchored text as dictionary values.
 195 
 196     A final "MACHINECODE" in bold seals the interpretation.
 197     """
 198 
 199     beginblock = "BEGINBLOCK"
 200     endblock_re = re.compile("\s*ENDBLOCK", re.M)
 201     delim_re = re.compile(r"\s*([:$%])")
 202     delim_quoted = ":"
 203     delim_raw = '%'
 204     delim_block = "$"
 205     delim_end = re.compile(";(?!;)")
 206     key_re = re.compile(r"[\w-]+", re.M)
 207     block_search = object()
 208     key_search = object()
 209     delim_search = object()
 210     raw_html = object()
 211     unquote_html = object()
 212     
 213     def __init__(self):
 214         HTMLParser.HTMLParser.__init__(self)
 215         
 216         self.data_block = None
 217         
 218         self.state = self.block_search 
 219         self.value = []
 220         self.stack = []
 221         self.blocks = []
 222     def handle_starttag(self, tag, attrs):
 223         if self.state==self.raw_html:
 224             a = []
 225             for key, value in attrs:
 226                 a.append(' %s="%s"' % (key, value))
 227             self.value.append("<%s%s>" % (tag, "".join(a)))
 228 
 229     def handle_endtag(self, tag):
 230         if self.state==self.raw_html:
 231             self.value.append("</%s>" % tag)
 232 
 233     def handle_data(self, data):
 234         pos = 0
 235 
 236         while pos<len(data):
 237             # search BEGINBLOCK
 238             if self.state==self.block_search:
 239                 pos = data.find(self.beginblock, pos)
 240                 if pos != -1:
 241                     print "\nBEGINBLOCK"
 242                     pos += len(self.beginblock)
 243                     self.state = self.key_search
 244                     if self.data_block: self.stack.append(self.data_block)
 245                     self.data_block = DataBlock()
 246                 else:
 247                     return
 248 
 249             # search KEY
 250             if self.state==self.key_search:
 251                 # ENDBLOCK
 252                 match = self.endblock_re.match(data, pos)
 253                 if match:
 254                     print "ENDBLOCK"
 255                     pos = match.end()
 256                     if self.stack:
 257                         value_block = self.data_block
 258                         self.data_block = self.stack.pop()
 259                         self.data_block.store_value(self.data_block.key, value_block)
 260                         self.state = self.key_search
 261                     else:
 262                         self.state = self.block_search
 263                         self.blocks.append(self.data_block)
 264                         self.data_block = None
 265                 # KEY
 266                 else:
 267                     match = self.key_re.search(data, pos)
 268                     if match:
 269                         self.data_block.key = match.group()
 270                         self.state = self.delim_search
 271                         print "KEY:", self.data_block.key, 
 272                         pos = match.end()
 273                     else:
 274                         return
 275 
 276             # search DELIMITER
 277             if self.state == self.delim_search:
 278                 match = self.delim_re.match(data, pos)
 279                 if match:
 280                     pos = match.end(1)
 281                     if match.group(1)==self.delim_raw:
 282                         self.state=self.raw_html
 283                     elif match.group(1)==self.delim_quoted:
 284                         self.state=self.unquote_html
 285                     elif match.group(1)==self.delim_block:
 286                         self.state=self.block_search
 287                     print match.group(1),
 288                 else:
 289                     return
 290 
 291             # search VALUE/ ENDDELIMITER ;
 292             if self.state in [self.raw_html, self.unquote_html]:
 293                 match = self.delim_end.search(data, pos)
 294                 if match:
 295                     self.value.append(data[pos:match.start(0)])
 296                     pos = match.end(0)
 297                     value = "".join(self.value)
 298                     self.value = []
 299                     print value
 300                     self.data_block.store_value(self.data_block.key, value)
 301                     self.state = self.key_search
 302                 else:
 303                     self.value.append(data[pos:])
 304                     return
 305 
 306     def handle_entityref(self, name):
 307         if self.state==self.raw_html:
 308             self.value.append("&%s;" % name)
 309         elif self.state==self.unquote_html:
 310             self.value.append(htmlentitydefs.entitydefs[name])
 311         # XXX break keys?
 312 
 313     def handle_charref(self, name):
 314         if self.state==self.raw_html:
 315             self.value.append("&#%s;" % name)
 316             return
 317         
 318         if name[0]=="x":
 319             character = unichr(int(name[1:], 16))
 320         else:
 321             character = unichr(int(name))
 322         
 323 
 324         if self.state==self.value:
 325             self.value.append(character)
 326         # XXX encoded delimiter, key chars, ...
 327     
 328 if __name__ == "__main__":
 329 
 330     store = DataBlockStore()
 331     #mc = store.get_block("http://www.emacswiki.org/cw/CommunityWiki")
 332     #for blck in mc: pprint.pprint(blck)
 333     #mc = store.get_blocks("http://moinmoin.wikiwikiweb.de/MachineCodeBlocks3/MetaSchema")
 334     mc = store.get_blocks("http://moinmoin.wikiwikiweb.de/MachineCodeBlocks3/RdfIntegration")
 335     print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
 336     rdf = mcb_rdf.RDF_World()
 337     for blck in mc:
 338         pprint.pprint(blck)
 339         rdf.mcb_rdf(blck)
 340 
 341     #print rdf.store.serialize(format="pretty-xml")

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2005-07-03 19:19:59, 11.1 KB) [[attachment:machineblock4.py]]
  • [get | view] (2005-07-03 19:10:49, 4.8 KB) [[attachment:mcb_rdf.py]]
 All files | Selected Files: delete move to page copy to page

You are not allowed to attach a file to this page.