import urllib import csv # legislative council, legislative assembly mlc_url = "http://www.parliament.vic.gov.au/members/list?f=csv&labels=EO-DL14&d=h&d=s&d=1&d=0&d=e&d=p&d=m&d=c&d=a&d=t&d=M&party=ALP&party=GRN&party=Ind&party=LP&party=NAT&house=MLC&gender=&port=&up=YYYY-MM-DD" mla_url = "http://www.parliament.vic.gov.au/members/list?f=csv&labels=EO-DL14&d=h&d=s&d=1&d=0&d=e&d=p&d=m&d=c&d=a&d=t&d=M&party=ALP&party=GRN&party=Ind&party=LP&party=NAT&house=MLA&gender=&port=&up=YYYY-MM-DD" # senate, house of reps senators_url = "http://www.aph.gov.au/~/media/03%20Senators%20and%20Members/Address%20Labels%20and%20CSV%20files/allsenel.ashx" house_url = "http://www.aph.gov.au/~/media/03%20Senators%20and%20Members/Address%20Labels%20and%20CSV%20files/SurnameRepsCSV.ashx" input_csv = "mps_downloaded.csv" output_csv = "MPs_list.csv" mp_types = [{ 'type': 'mlc', 'url': mlc_url, 'tag_list': "Politician, Victoria, Legislative Council"}, { 'type': 'mla', 'url': mla_url, 'tag_list': "Politician, Victoria, Legislative Assembly"}, { 'type': 'senate', 'url': senators_url, 'tag_list': "Politician, Federal, Senate"}, { 'type': 'house', 'url': house_url, 'tag_list': "Politician, Federal, House of Representatives"}] # don't know the parliament codes for some parties so the keys might need to change party_convert = {'GRN': 'G', 'ALP': 'L', 'LP': 'B', 'Nats': 'N', 'NAT': 'N', 'FAM': 'F', 'DEM': 'D', 'SEX': 'S', 'KAT': 'K', 'PIR': 'P', 'SOC': 'S', 'AG': 'O', 'CLP': 'O', 'DLP': 'O', 'Ind.': 'O','OTH': 'O' } columns = [ { 'column': 'prefix', 'senate': 1, 'house': 5, 'mlc': -1, 'mla': -1 }, { 'column': 'first_name', 'senate': 4, 'house': 3, 'mlc': 4, 'mla': 4 }, { 'column': 'last_name', 'senate': 0, 'house': 0, 'mlc': 5, 'mla': 5 }, { 'column': 'suffix', 'senate': 6, 'house': 7, 'mlc': -1, 'mla': -1 }, { 'column': 'party', 'senate': 9, 'house': 11, 'mlc': 7, 'mla': 7 }, { 'column': 'suite', 'senate': -1, 'house': -1, 'mlc': -1, 'mla': -1 }, { 'column': 'address1', 'senate': 22, 'house': 14, 'mlc': 9, 'mla': 9 }, { 'column': 'city', 'senate': 23, 'house': 15, 'mlc': -1, 'mla': -1 }, { 'column': 'state', 'senate': 24, 'house': 16, 'mlc': -1, 'mla': -1 }, { 'column': 'postcode', 'senate': 25, 'house': 17, 'mlc': -1, 'mla': -1 }, { 'column': 'phone_number', 'senate': 15, 'house': 18, 'mlc': 10, 'mla': 10 }, { 'column': 'email', 'senate': -1, 'house': -1, 'mlc': 11, 'mla': 11 }, { 'column': 'church', 'senate': -1, 'house': 13, 'mlc': 6, 'mla': 6 }, { 'column': 'religion', 'senate': 26, 'house': 21, 'mlc': 8, 'mla': 8 }, { 'column': 'tag_list', 'senate': -1, 'house': -1, 'mlc': -1, 'mla': -1 } ] def get_index_by_column_name(column_name): for i in range(len(columns)): if columns[i]['column'] == column_name: return i return None def get_mp_type_index_by_column_name(column_name, mp_type): for i in range(len(columns)): if columns[i]['column'] == column_name: return columns[i][mp_type] return None # create new csv we can use to import to nb with open(output_csv,"wb") as dest: writer= csv.writer(dest) # write nb column headings writer.writerow(tuple([ column['column'] for column in columns])) for mp_type in mp_types: # download their csv from website urllib.urlretrieve (mp_type['url'], input_csv) # process downloaded csv to conform to nb with open(input_csv,"rb") as src: reader = csv.reader(src) for field in reader: if reader.line_num == 1: continue #skip first line row = [] for column in columns: field_index = column[mp_type['type']] if field_index != -1: row.append(field[field_index]) else: row.append('') # fix city bug federal only if mp_type['type'] in ['senate', 'house']: city_index = get_index_by_column_name('city') if len(row[city_index]) == 0: address_index = get_mp_type_index_by_column_name('address1', mp_type['type']) address_parts = field[address_index].split(',') if len(address_parts) > 1: row[city_index] = address_parts.pop().strip() # last address_index = get_index_by_column_name('address1') row[address_index] = ",".join(address_parts).strip() # vic mps only fix prefix, some addresses over two cols, missing email if mp_type['type'] in ['mla', 'mlc']: # create prefix from hon and title fields row[get_index_by_column_name('prefix')] = (field[2] + " " + field[3]).strip() # address goes over two columns if field[4] == 'David' and field[5] == "O'Brien": row[get_index_by_column_name('address1')] = '2nd Floor, 1 Yarra Street La Cabine, Geelong 3220' row[get_index_by_column_name('phone_number')] = field[11] row[get_index_by_column_name('email')] = field[12] if field[4] == 'Hugh' and field[5] == "Delahunty": row[get_index_by_column_name('address1')] = '114 Firebrace Street, Horsham 3400' row[get_index_by_column_name('phone_number')] = '(03) 5382 0097' # handle case when input csv is missing the email address if row[get_index_by_column_name('email')] == '': row[get_index_by_column_name('email')] = "%s.%s@parliament.vic.gov.au" % (field[4].lower(), field[5].lower()) # fix party party_index = get_index_by_column_name('party') if row[party_index] in party_convert: row[party_index] = party_convert[row[party_index]] else: row[party_index] = 'O' # add tag list tag_list_index = get_index_by_column_name('tag_list') row[tag_list_index] = mp_type['tag_list'] # fix suite, federal only if mp_type['type'] in ['senate', 'house']: suite_index = get_index_by_column_name('suite') address_index = get_mp_type_index_by_column_name('address1', mp_type['type']) address_parts = field[address_index].split(',') suite_list = ["suite", "shop", "office", "level", "floor", "unit"] for substr in address_parts[0].lower().split(' '): if len(address_parts) > 1 and substr in suite_list: row[suite_index] = address_parts.pop(0).strip() # first item of address is suite/shop/office address_index = get_index_by_column_name('address1') row[address_index] = ",".join(address_parts).strip() else: address_index = get_index_by_column_name('address1') address = row[address_index] row[get_index_by_column_name('postcode')] = address[-4:] address_parts = address[:-5].split(',') row[get_index_by_column_name('city')] = address_parts.pop().strip() row[address_index] = address_parts.pop().strip() if len(address_parts) > 0: row[get_index_by_column_name('suite')] = " ".join(address_parts) row[get_index_by_column_name('suite')] = ' '.join(row[get_index_by_column_name('suite')].split()) #remove excess whitespace # preferred name if mp_type['type'] in ['senate']: if len(field[4].strip()) > 0: row[get_index_by_column_name('first_name')] = field[4] else: row[get_index_by_column_name('first_name')] = field[2] print "First name set to: %s" % row[get_index_by_column_name('first_name')] if mp_type['type'] in ['house']: if len(field[3].strip()) > 0: row[get_index_by_column_name('first_name')] = field[3] else: row[get_index_by_column_name('first_name')] = field[1] print "First name set to: %s" % row[get_index_by_column_name('first_name')] if mp_type['type'] in ['mla', 'mlc']: row[get_index_by_column_name('state')] = "VIC" # write row writer.writerow( tuple(row) ) print "Created %s" % output_csv
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question