#!/usr/bin/env python
"""Convert downloaded chequegourmet HTML listings into delimited text rows.

Every ``*.html`` file found in the ``download/chequegourmet`` directory next
to this script is scanned for table rows, and one tab-separated line is
printed to stdout per usable row.

Format: Id Restaurant Address CP Town Province Telephone Food Latitude Longitude

The Id, Food, Latitude and Longitude columns are always emitted as the
literal text NULL; the remaining six columns come from the table cells.
"""
import glob
import os
import re

# Directory holding the previously downloaded HTML pages.
path = os.path.join(os.path.dirname(__file__), 'download', 'chequegourmet')

# The opening tag is part of each pattern so that the captured text is only
# the element content: without it every cell would start with its own
# "<td ...>" markup and the first row would swallow the page preamble.
re_lines = re.compile(r"<tr\b[^>]*>(.*?)</tr>")
re_fields = re.compile(r"<td\b[^>]*>(.*?)</td>")

HEADER = "#Id Restaurant Address CP Town Province Telephone Food Latitude Longitude"

# Cell positions 0..5 are read when formatting a row.
MIN_FIELDS = 6


def read_source(infile):
    """Return the content of *infile* flattened onto a single line.

    Each physical line is stripped and prefixed with one space, so the
    patterns (whose ``.`` does not match a newline) can span what used to be
    several lines of markup.
    """
    # The context manager closes the handle even if reading fails.
    with open(infile, 'r') as handle:
        return "".join(" " + sourceline.strip() for sourceline in handle)


def extract_rows(source):
    """Return the stripped cell texts of every usable table row in *source*.

    Rows with fewer than ``MIN_FIELDS`` cells (for example header or spacer
    rows) are skipped rather than raising ``IndexError`` later on.
    """
    rows = []
    for line in re_lines.findall(source):
        fields = [field.strip() for field in re_fields.findall(line)]
        if len(fields) >= MIN_FIELDS:
            rows.append(fields)
    return rows


def format_row(fields):
    """Return one tab-separated output line for the cells in *fields*.

    Cells 3, 4 and 5 are emitted in reverse order (5, 4, 3) to fill the
    Town, Province and Telephone columns respectively.
    """
    return "\t".join([
        "NULL",
        fields[0],
        fields[1],
        fields[2],
        fields[5],
        fields[4],
        fields[3],
        "NULL",
        "NULL",
        "NULL",
    ])


def convert(source):
    """Return the list of output lines produced from flattened HTML *source*."""
    return [format_row(fields) for fields in extract_rows(source)]


def main():
    """Print the header followed by one line per row of every input file."""
    print(HEADER)
    # Sorted so that the output order does not depend on the filesystem.
    for infile in sorted(glob.glob(os.path.join(path, '*.html'))):
        for row in convert(read_source(infile)):
            print(row)


if __name__ == "__main__":
    main()