#!/usr/bin/env python
'''
Extract restaurant records from saved HTML pages and print them as
tab-separated values on standard output.

Format: "Id Restaurant Address CP Town Province Telephone Food Latitude Longitude"

Every ``*.html`` file in the ``restaurants`` directory next to this script
produces one row. Fields are read from fixed line positions of each page.
Province, Latitude and Longitude are not extracted; they are always written
as the literal string NULL.
'''

import glob
import os
import re
import sys

# Glob pattern matching the HTML pages stored beside this script.
path = os.path.join(os.path.dirname(__file__), 'restaurants', '*.html')

# First line of the output; the leading '#' marks it as a header row.
HEADER = "#Id\tRestaurant\tAddress\tCP\tTown\tProvince\tTelephone\tFood\tLatitude\tLongitude"

# Placeholder written for columns this script does not extract.
NULL = "NULL"


def remove_tabs(str):
    """Return the given string with every tab replaced by a single space.

    Tabs are the column separator of the output, so a tab inside a field
    would shift every following column of that row.

    The parameter keeps its original name so the signature is unchanged.
    """
    return str.replace("\t", " ")


def _tag_text(line):
    """Return the stripped text between the first '>' and the next '<'.

    Raises IndexError when the line contains no '>'.
    """
    return line.split(">", 1)[1].split("<")[0].strip()


def parse_restaurant(restaurant_id, lines):
    """Build the ten output columns for one restaurant page.

    Args:
        restaurant_id: value for the Id column.
        lines: the lines of the page, already stripped of surrounding
            whitespace, indexed from zero.

    Returns:
        A list of ten strings in the order given by HEADER, with tabs
        inside any field replaced by spaces.

    Raises:
        IndexError: if the page has fewer than 13 lines, or one of the
            tag-delimited lines contains no '>'.
    """
    name = lines[4].split("<")[0].strip()      # text before the first tag
    address = lines[5].split(">", 1)[1].strip()  # everything after the first tag
    postcode = _tag_text(lines[8])
    town = _tag_text(lines[9])
    telephone = _tag_text(lines[10])
    food = lines[12].strip()
    row = [restaurant_id, name, address, postcode, town, NULL,
           telephone, food, NULL, NULL]
    return [remove_tabs(field) for field in row]


def main():
    """Print the header, then one tab-separated row per HTML page.

    Pages that do not have the expected layout are reported on standard
    error and skipped so that one bad file does not abort the whole run.
    """
    print(HEADER)
    for infile in glob.glob(path):
        # The id is the file name up to its first dot; basename copes with
        # whichever path separator the platform uses.
        restaurant_id = os.path.basename(infile).split(".")[0]
        with open(infile, 'r') as page:
            lines = [line.strip() for line in page]
        try:
            row = parse_restaurant(restaurant_id, lines)
        except IndexError:
            sys.stderr.write("skipping %s: unexpected page layout\n" % infile)
            continue
        print("\t".join(row))


if __name__ == "__main__":
    main()