#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Load vendor restaurant listings from tab-separated files into table ``raw``.

Run as a script, this truncates ``raw`` and then imports
``output/<vendor>.tab`` for every entry in ``vendors``.

The code targets Python 2: text is handled as UTF-8 encoded byte strings.
"""

from web import db, SQLLiteral, config
import sys
import re
import string

try:
    # The local settings module rebinds the ``config`` name imported above.
    import config
except ImportError:
    print("Please create a config file first - see README.txt")
    # Exit with a failure status so callers/shell scripts can detect it.
    sys.exit(1)

# wdb for writes
wdb = db.database(
    dbn=config.dbtype,
    host=config.dbhost,
    db=config.dbname,
    user=config.dbuser,
    passwd=config.dbpass,
    use_unicode=True,
    charset='utf8',
)

# Matches every single byte in the range 0x80-0xFF.
r_iso = re.compile('([\x80-\xFF])')

# Number of tab-separated columns import_tab() reads from each row.
_FIELD_COUNT = 10

# Upper-case characters that lower_case() maps explicitly after str.lower().
_UPPER_TO_LOWER = (
    ("Ñ", "ñ"),
    ("Ç", "ç"),
    ("Á", "á"),
    ("É", "é"),
    ("Í", "í"),
    ("Ó", "ó"),
    ("Ú", "ú"),
)

# NOTE(review): the replacement literals previously read as already-decoded
# characters (e.g. "&" -> "&"), which made the first replacement a no-op.
# They look like HTML entities that were decoded by an editor or export; the
# entity spellings are restored here and the decoded characters are still
# accepted. Confirm against the vendor data.
_ENTITY_REPLACEMENTS = (
    ("&nbsp;", " "),
    ("\xc2\xa0", " "),  # UTF-8 encoded no-break space (Python 2 byte string)
    ("&Ntilde;", "ñ"),
    ("&ntilde;", "ñ"),
    ("Ñ", "ñ"),
    ("&amp;", "&"),  # decoded last so "&amp;nbsp;" is not decoded twice
)

# Trailing ", <article>" forms that normalise_name() moves to the front.
_TRAILING_ARTICLES = ('el', 'la')


def iso2utf(s):
    """Convert iso-8859-1 to utf-8. Sean B. Palmer.

    Every byte in 0x80-0xBF becomes ``0xC2 <byte>``; every byte in 0xC0-0xFF
    becomes ``0xC3 <byte - 64>``. Other bytes are left untouched.
    Not called anywhere in this file.
    """
    def conv(m):
        c = m.group(0)
        if ord(c) > 0xBF:
            return '\xC3' + chr(ord(c) - 64)
        return '\xC2' + c
    return r_iso.sub(conv, s)


def lower_case(str):
    """Return ``str`` lower-cased, including Ñ, Ç, Á, É, Í, Ó and Ú.

    The explicit replacements run after ``str.lower()``; presumably they are
    needed because ``lower()`` on a Python 2 byte string leaves non-ASCII
    characters unchanged.

    The parameter keeps its original name (which shadows the builtin) so
    keyword callers keep working.
    """
    lowered = str.lower()
    for upper, lower in _UPPER_TO_LOWER:
        lowered = lowered.replace(upper, lower)
    return lowered


def normalise_fields(fields):
    """Return a new list of cleaned-up field values.

    Each field is stripped of surrounding whitespace first. A field equal to
    ``NULL`` becomes the empty string; any other field is re-encoded from
    iso-8859-1 to utf-8.

    Stripping before the ``NULL`` comparison matters for the last column of
    a row, which still carries the line's trailing newline.
    """
    newfields = []
    for field in fields:
        value = field.strip()
        if value == "NULL":
            newfields.append("")
        else:
            newfields.append(value.decode('iso-8859-1').encode('utf-8'))
    return newfields


def convert_entities(str):
    """Replace the HTML entities listed in ``_ENTITY_REPLACEMENTS``.

    ``&nbsp;`` (and a literal no-break space) becomes a plain space,
    ``&Ntilde;``/``&ntilde;``/``Ñ`` become ``ñ`` and ``&amp;`` becomes ``&``.

    The parameter keeps its original name (which shadows the builtin) so
    keyword callers keep working.
    """
    converted = str
    for entity, replacement in _ENTITY_REPLACEMENTS:
        converted = converted.replace(entity, replacement)
    return converted


def normalise_name(name):
    """Return a display form of a restaurant name.

    The name is lower-cased, has its entities converted, has a trailing
    ", el" / ", la" moved to the front ("bodega, la" -> "la bodega") and is
    finally passed through ``string.capwords``.
    """
    name = convert_entities(lower_case(name))
    for article in _TRAILING_ARTICLES:
        suffix = ', ' + article
        # Only rewrite when something precedes the suffix.
        if len(name) > len(suffix) and name.endswith(suffix):
            name = article + ' ' + name[:-len(suffix)]
    return string.capwords(name)


def normalise_address(address):
    """Return ``address`` lower-cased and with its entities converted."""
    return convert_entities(lower_case(address))


def import_tab(vendor, file):
    """Insert every data row of a tab-separated file into table ``raw``.

    Parameters:
        vendor: value stored in the ``vendor`` column of every inserted row.
        file: path of the tab-separated file to read.

    Lines starting with ``#`` and empty lines are skipped. Columns are read
    by position: res_id, name, address, cp, town, province, phone, food,
    latitude, longitude. ``created`` and ``updated`` are set to SQL ``NOW()``.

    Raises:
        ValueError: if a data row has fewer than ``_FIELD_COUNT`` columns.
    """
    # The with-block guarantees the file is closed even if an insert fails.
    with open(file, 'r') as handle:
        for lineno, line in enumerate(handle, 1):
            if line.startswith("#") or not line.rstrip("\r\n"):
                continue
            fields = normalise_fields(line.split("\t"))
            if len(fields) < _FIELD_COUNT:
                raise ValueError(
                    "%s:%d: expected at least %d tab-separated fields, got %d"
                    % (file, lineno, _FIELD_COUNT, len(fields))
                )
            wdb.insert(
                'raw',
                vendor=vendor,
                res_id=fields[0],
                name=normalise_name(fields[1]),
                address=normalise_address(fields[2]),
                cp=fields[3],
                town=fields[4],
                province=fields[5],
                phone=fields[6],
                food=fields[7],
                latitude=fields[8],
                longitude=fields[9],
                created=SQLLiteral("NOW()"),
                updated=SQLLiteral("NOW()"),
            )


vendors = ('chequegourmet', 'mobiticket', 'sodexo')

if __name__ == "__main__":
    # Start from an empty table, then load each vendor's export in turn.
    wdb.query("TRUNCATE table raw")
    for vendor in vendors:
        import_tab(vendor, "output/%s.tab" % vendor)