Attachment 'machineblock4.py'
Download 1 #!/usr/bin/python
2 """MachineCode interpreter.
3
4 Interprets and caches MachineCode from XHTML pages.
5
6 DataBlockStore -- Create DataBlock instances.
7 DataBlock -- A MachineCode block on a website.
8 """
9
10 import re
11
12 import pprint
13 import HTMLParser, htmlentitydefs
14
15 import webcache
16
17 import mcb_rdf
18
class DataBlockStore:

    """Creates and caches data blocks.

    When you want the DataBlocks of a page, call "get_blocks".  It returns
    the cached blocks of that page, or parses the page first and caches
    the result.

    The DataBlockStore also maintains a shared webpage cache.

    get_blocks -- Obtain the DataBlocks of a page (optionally by id)
    read_page -- Fetch and parse a page into the blocks cache
    """

    def __init__(self, directory="."):
        """Create the shared page cache below *directory*.

        The third WebCache argument is 24*60*60; presumably a lifetime of
        one day in seconds -- confirm against the webcache module.
        """
        self.cache = webcache.WebCache(directory + "/pages.db",
                                       directory + "/times.db",
                                       24*60*60)    # Shared webpage cache
        self.blocks = {}                            # Blocks cache: url -> list

    def get_blocks(self, url, baseurl=None):
        """Return the DataBlocks representing a website's MachineCode.

        If the page has already been parsed, use the cached blocks.  If
        not, read the page, cache its blocks, and then return them.

        url -- an absolute "http://" / "https://" URL, optionally followed
               by "#name"; or, when baseurl is given, a reference
               ("#name" or plain "name") into the page at baseurl
        baseurl -- page that non-absolute references are resolved against

        Returns a list.  When a name is given, only the blocks whose first
        "id" value equals that name are returned (possibly none);
        otherwise all blocks of the page are returned.
        """
        is_absolute = url.startswith(("http://", "https://"))
        if baseurl and not is_absolute:
            # Reference into the base page; startswith() is also safe for
            # an empty reference, which then selects the whole base page.
            if url.startswith("#"):
                name = url[1:]
            else:
                name = url
            url = baseurl
        else:
            # "page#name" -> page, name; name is "" without a fragment.
            url, _, name = url.partition("#")

        if url not in self.blocks:
            self.read_page(url)

        if not name:
            return self.blocks[url]
        return [block for block in self.blocks[url]
                if block.get("id", [""])[0] == name]

    def read_page(self, url):
        """Fetch *url* through the page cache, parse it, and cache its blocks.

        Every top-level block found on the page gets its "_url" attribute
        set to *url*.  Types are not applied here.
        """
        page = self.cache.get_page(url)
        parser = MachineCodeHtmlParser()
        parser.feed(page)
        parser.close()
        self.blocks[url] = parser.blocks
        for block in self.blocks[url]:
            block._url = url
78
class DataBlock(dict):

    """A MachineCode block: a dict mapping each key to a list of values.

    Values are strings or nested DataBlocks.  A block may carry a "type"
    entry whose first value is either a DataBlock or a string reference
    that is resolved through a store's get_blocks().

    store_value -- Append a value to the list stored under a key
    get_type -- Resolve this block's type block
    apply_type -- Convert the values according to the block's type
    """

    # When true, values that do not fit their declared type are dropped.
    strict_types = 0

    # Page the block came from; string type references are resolved
    # relative to it.  The default keeps get_type() from failing with
    # AttributeError on a block that was never assigned a URL.
    _url = None

    # Types accepted as "a string" (Python 2 also has unicode).
    try:
        _string_types = (str, unicode)
    except NameError:
        _string_types = (str,)

    def store_value(self, key, value):
        """Append *value* to the list under *key*, creating it if needed."""
        if key in self:
            self[key].append(value)
        else:
            self[key] = [value]

    def get_required_attributes(self, type):
        """Return the keys of the required attributes declared by *type*.

        An attribute counts as required unless its first "required" value
        converts to integer 0; attributes without a "key" are skipped.
        Returns [] when *type* is empty or None.
        """
        if not type:
            return []
        return [attr["key"][0]
                for attr in type.get("attribute", [])
                if int(attr.get("required", [1])[0]) and "key" in attr]

    def get_type(self, store):
        """Return the block describing this block's type, or None.

        A string "type" value is looked up with store.get_blocks(),
        relative to this block's URL; the first block found is used.
        """
        declared = self.get("type")
        if not declared:
            return None
        block_type = declared[0]
        if isinstance(block_type, self._string_types):
            found = store.get_blocks(block_type, self._url)
            if not found:
                return None
            block_type = found[0]
        return block_type

    def get_type_attribute(self, type, key):
        """Return the attribute block of *type* whose first key is *key*.

        Returns None when *type* declares no such attribute.
        """
        for attr in type.get('attribute', []):
            keys = attr.get('key', [])
            if keys and keys[0] == key:
                return attr
        return None

    def _handle_string(self, value):
        """Return *value*; in strict mode non-strings become None."""
        if self.strict_types and not isinstance(value, self._string_types):
            return None
        return value

    def _handle_number(self, value):
        """Return *value* as a float.

        A value that cannot be converted is returned unchanged, or
        replaced by None in strict mode.  TypeError is caught as well so
        that a nested block used as a value does not abort the conversion.
        """
        try:
            return float(value)
        except (TypeError, ValueError):
            if self.strict_types:
                return None
            return value

    def _handle_date(self, value):
        """Return *value* unchanged (date parsing is not implemented)."""
        return value #XXX

    def _handle_url(self, value):
        """Return *value* treated as a string; URLs are not resolved yet."""
        value = self._handle_string(value)
        # XXX href replacement ._url extention
        return value

    def _handle_any(self, value):
        """Return *value* unchanged."""
        return value

    def _Handle_block(self, value, block_type, store):
        """Type a nested block *value* as *block_type* and return it.

        A value that is not a DataBlock, or that declares a different
        type, is returned as None in strict mode.  A block that already
        resolves to *block_type* is returned without being converted
        again.

        The capital "H" means the "_handle_<name>" lookup in apply_type
        never finds this method, which takes extra arguments.
        """
        if not isinstance(value, DataBlock):
            if self.strict_types:
                return None
            return value
        value._url = self._url
        value_type = value.get_type(store)
        if value_type is block_type:
            return value
        if value_type is None:
            value['type'] = [block_type]
        elif self.strict_types:
            return None
        value.apply_type(store)
        return value

    def _resolve_handler(self, attr_type, store):
        """Return the one-argument converter for an attribute type."""
        if isinstance(attr_type, self._string_types):
            attr_type = attr_type.strip()
            handler = getattr(self, "_handle_%s" % attr_type, None)
            if handler:
                return handler
            # Not a built-in type name: try it as a reference to a block.
            type_blocks = store.get_blocks(attr_type, self._url)
            if type_blocks:
                referenced = type_blocks[0]
                return lambda v: self._Handle_block(v, referenced, store)
            return self._handle_any #XXX Blocks
        if isinstance(attr_type, DataBlock):
            return lambda v: self._Handle_block(v, attr_type, store)
        # Unknown kind of type value: leave the values alone instead of
        # reusing whatever handler an earlier key happened to select.
        return self._handle_any

    def apply_type(self, store):
        """Convert this block's values as declared by its type.

        For every key the type declares an attribute for, each value is
        passed through the handler of the attribute's type ("any" when
        the attribute names none).  In strict mode values the handler
        rejected (None) are removed.  Does nothing for untyped blocks.
        """
        block_type = self.get_type(store)
        if not block_type:
            return
        #req_type = self.get_required_attributes(type)
        # Iterate over a snapshot: entries are reassigned inside the loop.
        for key, values in list(self.items()):
            attribute = self.get_type_attribute(block_type, key)
            if not attribute:
                continue
            attr_type = attribute.get("type", ['any'])[0]
            handler = self._resolve_handler(attr_type, store)
            values = [handler(value) for value in values]
            if self.strict_types:
                values = [value for value in values if value is not None]
            self[key] = values
184
class MachineCodeHtmlParser(HTMLParser.HTMLParser):

    """Interpret XHTML parse events and collect the DataBlocks of a page.

    The parser scans character data for the text "BEGINBLOCK".  Inside a
    block it reads keys (word characters and "-"), each followed by a
    delimiter that selects how the value is read:

    ":" -- text value; markup is dropped, entity and character
           references are decoded
    "%" -- raw value; tags and references are kept as HTML text
    "$" -- the value is a nested block (another BEGINBLOCK follows)

    A text or raw value ends at a ";" that is not followed by another
    ";".  "ENDBLOCK" closes the current block.  Finished top-level blocks
    are collected in self.blocks; nested blocks are stored as values of
    their parent block.
    """

    beginblock = "BEGINBLOCK"
    endblock_re = re.compile(r"\s*ENDBLOCK", re.M)
    delim_re = re.compile(r"\s*([:$%])")
    delim_quoted = ":"
    delim_raw = '%'
    delim_block = "$"
    delim_end = re.compile(";(?!;)")
    key_re = re.compile(r"[\w-]+", re.M)

    # Parser states (unique sentinels, compared by identity).
    block_search = object()     # looking for BEGINBLOCK
    key_search = object()       # looking for a key or ENDBLOCK
    delim_search = object()     # looking for the delimiter after a key
    raw_html = object()         # collecting a "%" value
    unquote_html = object()     # collecting a ":" value

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self.data_block = None          # block currently being filled

        self.state = self.block_search
        self.value = []                 # pieces of the value being read
        self.stack = []                 # enclosing blocks of data_block
        self.blocks = []                # finished top-level blocks

    def handle_starttag(self, tag, attrs):
        """Inside a raw value, re-emit the start tag as HTML text.

        Attribute values arrive unescaped from the parser, so "&" and the
        double quote are escaped again; an attribute without a value is
        written as a bare name.
        """
        if self.state is not self.raw_html:
            return
        pieces = []
        for key, value in attrs:
            if value is None:
                pieces.append(' %s' % key)
            else:
                escaped = value.replace("&", "&amp;").replace('"', "&quot;")
                pieces.append(' %s="%s"' % (key, escaped))
        self.value.append("<%s%s>" % (tag, "".join(pieces)))

    def handle_endtag(self, tag):
        """Inside a raw value, re-emit the end tag as HTML text."""
        if self.state is self.raw_html:
            self.value.append("</%s>" % tag)

    def handle_data(self, data):
        """Advance the block state machine over one chunk of text.

        The state is kept between calls, so a block, key or value may
        span several chunks (for example when markup interrupts the text).
        """
        pos = 0

        while pos < len(data):
            # search BEGINBLOCK
            if self.state is self.block_search:
                pos = data.find(self.beginblock, pos)
                if pos == -1:
                    return
                pos += len(self.beginblock)
                self.state = self.key_search
                # Test against None: an outer block that has no stored
                # values yet is an empty (false) dict but must still be
                # kept as the parent of the block that starts here.
                if self.data_block is not None:
                    self.stack.append(self.data_block)
                self.data_block = DataBlock()

            # search KEY
            if self.state is self.key_search:
                match = self.endblock_re.match(data, pos)
                if match:
                    # ENDBLOCK
                    pos = match.end()
                    finished = self.data_block
                    if self.stack:
                        # Nested block: it becomes a value of its parent.
                        self.data_block = self.stack.pop()
                        self.data_block.store_value(self.data_block.key,
                                                    finished)
                        self.state = self.key_search
                    else:
                        self.state = self.block_search
                        self.blocks.append(finished)
                        self.data_block = None
                else:
                    # KEY
                    match = self.key_re.search(data, pos)
                    if not match:
                        return
                    self.data_block.key = match.group()
                    self.state = self.delim_search
                    pos = match.end()

            # search DELIMITER
            if self.state is self.delim_search:
                match = self.delim_re.match(data, pos)
                if not match:
                    return
                pos = match.end(1)
                delimiter = match.group(1)
                if delimiter == self.delim_raw:
                    self.state = self.raw_html
                elif delimiter == self.delim_quoted:
                    self.state = self.unquote_html
                elif delimiter == self.delim_block:
                    self.state = self.block_search

            # search VALUE / ENDDELIMITER ;
            if self.state in (self.raw_html, self.unquote_html):
                match = self.delim_end.search(data, pos)
                if not match:
                    # Value continues in a later event.
                    self.value.append(data[pos:])
                    return
                self.value.append(data[pos:match.start(0)])
                pos = match.end(0)
                value = "".join(self.value)
                self.value = []
                self.data_block.store_value(self.data_block.key, value)
                self.state = self.key_search

    def handle_entityref(self, name):
        """Add an entity reference to the value being collected.

        Raw values keep the reference as written.  Text values get the
        replacement from htmlentitydefs.entitydefs; an unknown entity name
        is kept as written instead of raising KeyError.
        """
        if self.state is self.raw_html:
            self.value.append("&%s;" % name)
        elif self.state is self.unquote_html:
            self.value.append(
                htmlentitydefs.entitydefs.get(name, "&%s;" % name))
        # XXX break keys?

    def handle_charref(self, name):
        """Add a numeric character reference to the value being collected.

        Raw values keep the reference as written.  Text values get the
        decoded character; a reference that cannot be decoded is kept as
        written.  Outside of a value the reference is ignored.
        """
        if self.state is self.raw_html:
            self.value.append("&#%s;" % name)
            return
        if self.state is not self.unquote_html:
            return

        try:
            to_char = unichr
        except NameError:
            to_char = chr
        try:
            if name[:1] in ("x", "X"):
                character = to_char(int(name[1:], 16))
            else:
                character = to_char(int(name))
        except (ValueError, OverflowError):
            character = "&#%s;" % name
        self.value.append(character)
        # XXX encoded delimiter, key chars, ...
327
if __name__ == "__main__":
    # Demo run: parse the blocks of one wiki page, dump each block and
    # hand it to the RDF converter.
    # Another page that can be tried:
    #   http://moinmoin.wikiwikiweb.de/MachineCodeBlocks3/MetaSchema
    page_url = "http://moinmoin.wikiwikiweb.de/MachineCodeBlocks3/RdfIntegration"

    block_store = DataBlockStore()
    page_blocks = block_store.get_blocks(page_url)
    print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
    world = mcb_rdf.RDF_World()
    for page_block in page_blocks:
        pprint.pprint(page_block)
        world.mcb_rdf(page_block)

    # To inspect the result: print world.store.serialize(format="pretty-xml")
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily. You are not allowed to attach a file to this page.