#!/usr/bin/env python
'''
Format:
"Id Restaurant Address CP Town Province Telephone Food Latitude Longitude"
'''
import os, glob, re
# Directory holding the downloaded HTML pages, resolved relative to this script.
path = os.path.join(os.path.dirname(__file__), 'download', 'chequegourmet')

# Non-greedy captures so each match stops at the first closing tag.
# Only bare opening tags (no attributes) are matched.
# NOTE(review): the opening tags are assumed to be plain <tr>/<td> -- confirm
# against the downloaded pages.
re_lines = re.compile(r"<tr>(.*?)<\/tr>")
re_fields = re.compile(r"<td>(.*?)<\/td>")

# Number of table cells a row must have to be exported (indices 0..5 are read).
MIN_FIELDS = 6


def extract_rows(source):
    """Return the table rows found in *source*.

    *source* is the HTML of one page as a single string.  The result is a
    list with one entry per ``<tr>...</tr>`` element; each entry is the list
    of its ``<td>`` cell contents with surrounding whitespace stripped.
    """
    return [
        [cell.strip() for cell in re_fields.findall(row)]
        for row in re_lines.findall(source)
    ]


def format_row(fields):
    """Return the tab-separated output line for one table row.

    Cells 0, 1 and 2 are emitted in order after a leading "NULL"; cells 5, 4
    and 3 follow in that (reversed) order, and three trailing "NULL" columns
    complete the record.  Returns None when *fields* has fewer than
    MIN_FIELDS cells, so short rows can be skipped instead of raising
    IndexError.
    """
    if len(fields) < MIN_FIELDS:
        return None
    columns = [
        "NULL",
        fields[0],
        fields[1],
        fields[2],
        fields[5],
        fields[4],
        fields[3],
        "NULL",
        "NULL",
        "NULL",
    ]
    return "\t".join(columns)


# NOTE(review): the header separates names with single spaces while the data
# rows below are tab-separated -- confirm which one consumers expect.
print("#Id Restaurant Address CP Town Province Telephone Food Latitude Longitude")
for infile in glob.glob(os.path.join(path, '*.html')):
    # Collapse the page into one line so a row spanning several physical
    # lines can still be matched by the single-line patterns above.
    with open(infile, 'r') as f:
        source = " ".join(sourceline.strip() for sourceline in f)
    for fields in extract_rows(source):
        output_line = format_row(fields)
        if output_line is not None:
            print(output_line)