import zipfile
import os
import pandas as pd
import StringIO
import numpy as np
import gzip

"""
Generate a single text data file from the National Bridge Inventory zip
archives.

The columns of the data file that is generated are:

  Column 1:   State abbreviation
  Column 2:   Bridge identifier
  Column 3:   Year the bridge was constructed
  Columns 4-: Traffic levels

The years of the traffic levels range from year0 to yearmax, inclusive (see
below).  When the traffic level was not observed, it is recorded as nan.

This script takes several hours to run.
"""

## Need to set this to a path that can be used to store temporary files.
## NOTE: this must be a raw string -- in a plain Python 2 string literal the
## "\a" in "\ahallo" is the BEL escape character and silently corrupts the path.
tmp_path = r"C:\Users\ahallo\Desktop\Current Work\National Bridge Inventory\pre2009BayArea"

## This year is taken as the time origin.
year0 = 1990

## Discard data after this year.
yearmax = 2009

## Fixed-width field positions (1-based, inclusive) of each NBI item, from
## http://www.fhwa.dot.gov/bridge/nbi/format.cfm
State = [1, 3]
Structure_id = [4, 18]
Traffic = [165, 170]
Traffic_year = [171, 174]
Year_built = [157, 160]
Latitude = [130, 137]
Longitude = [138, 146]
Year_reconstructed = [362, 365]
item_5a = [19, 19]
item_49 = [223, 228]
item_112 = [374, 374]
item_42a = [200, 200]

colspecs = [State, Structure_id, Traffic, Traffic_year, Year_built, item_5a,
            item_49, item_112, item_42a, Year_reconstructed, Latitude, Longitude]

## Convert to the zero-based, half-open [start, stop) intervals that
## pd.read_fwf expects: shift the start down by one, leave the stop as-is.
for c in colspecs:
    c[0] -= 1

## Use these column names for the columns in `colspecs`.
## Positional column index -> human-readable name, matching `colspecs` order.
colnames = {0: "State", 1: "Structure_id", 2: "Traffic", 3: "Traffic_year",
            4: "Year_built", 5: "item_5a", 6: "item_49", 7: "item_112",
            8: "item_42a", 9: "Year_reconstructed", 10: "Latitude",
            11: "Longitude"}

## Two-letter state abbreviation -> numeric NBI state code.
ST = {"AL": 14, "AK": 20, "AZ": 49, "AR": 56, "CA": 69,
      "CO": 88, "CT": 91, "DE": 103, "DC": 113, "FL": 124,
      "GA": 134, "HI": 159, "ID": 160, "IL": 175, "IN": 185,
      "IA": 197, "KS": 207, "KY": 214, "LA": 226, "ME": 231,
      "MD": 243, "MA": 251, "MI": 265, "MN": 275, "MS": 284,
      "MO": 297, "MT": 308, "NE": 317, "NV": 329, "NH": 331,
      "NJ": 342, "NM": 356, "NY": 362, "NC": 374, "ND": 388,
      "OH": 395, "OK": 406, "OR": 410, "PA": 423, "RI": 441,
      "SC": 454, "SD": 468, "TN": 474, "TX": 486, "UT": 498,
      "VT": 501, "VA": 513, "WA": 530, "WV": 543, "WI": 555,
      "WY": 568}

## Reverse map: numeric NBI state code -> two-letter abbreviation.
STR = dict((code, abbrev) for abbrev, code in ST.items())


def my_float(x):
    """Parse x as a float; return nan when the value is not parseable."""
    try:
        value = float(x)
    except ValueError:
        value = float("nan")
    return value


## TR[state][bridge] is a vector of annual traffic counts starting in year
## `year0`.  Missing data are denoted nan.
TR = {}

## YB[state][bridge] is the year that bridge `bridge` in state `state` was built.
YB = {}

## LatLon[state][bridge] is the location of bridge `bridge` in state `state`.
LatLon = {} ## Loop over the zip archives L = os.listdir(tmp_path) L = [x for x in L if x.endswith(".zip")] for file in L: print "Processing " + file + ":" ## The path to the zip archive dpath = os.path.join(tmp_path, file) ## Open the zip archive fid = zipfile.ZipFile(dpath) ## Get a list of files in the archive IL = fid.infolist() ## Loop over the files, and process them one by one for ilj,il in enumerate(IL): print " %.0f%% " % np.floor(100*ilj/float(len(IL))) ## The name of a file in the archive fname = il.filename ## Read the entire file, and wrap it in a stringio D = fid.read(fname) sio = StringIO.StringIO(D) ## Put the data into a pandas data frame D = pd.read_fwf(sio, colspecs=colspecs, header=None) ## Give the columns useful names D.rename(columns=colnames, inplace=True) ## Require certain items to be numbers. for vn in "item_5a","Traffic": D[vn] = D[vn].map(my_float) D[vn] = D[vn].astype(np.float64) """The NBI database includes bridges that are not highway bridges. Here, we include only the highway bridges in our analysis. 
The following code accomplishes this selection.""" ## Drop any rows with missing values D = D.dropna() ## Loop over the records in a file for ix in D.index: ## The state for the current record try: state = STR[D.loc[ix,"State"]] except KeyError: continue ## The first time this state was encountered if state not in TR: TR[state] = {} YB[state] = {} LatLon[state] = {} ## The first time this bridge was encountered structure_id = D.loc[ix,"Structure_id"] ## The first time this structure was encountered if structure_id not in TR[state]: TR[state][structure_id] = [float("nan"),]*(yearmax - year0 + 1) ## Be prepared for non-paresable year data try: YB[state][structure_id] = float(D.loc[ix,"Year_built"]) except ValueError: YB[state][structure_id] = float("nan") lat = str(D.loc[ix,"Latitude"]) lon = str(D.loc[ix,"Longitude"]) latlon = lat + ";" + lon LatLon[state][structure_id] = latlon ## Store the traffic data if possible try: year = int(D.loc[ix,"Traffic_year"]) except ValueError: continue if year >= year0 and year <= yearmax: traffic = D.loc[ix,"Traffic"] try: TR[state][structure_id][year-year0] = traffic except ValueError: continue ## Save all the results with gzip.open("../Data/nbi_data.csv.gz", "w") as fid: fid.write("State,Structure id,Year built,Latitude,Longitude,") fid.write(",".join([str(y) for y in range(year0, yearmax+1)]) + "\n") ## We will write out the data for the states in alphabetical order States = TR.keys() States.sort() for state in States: ## Alphabetically sort the bridge id's, file easier to ## manually scan. K = TR[state].keys() K.sort() for k in K: ## Some states may be missing year of construction data yb = float("nan") try: yb = YB[state][k] except KeyError: pass latlon = "" try: latlon = LatLon[state][k] except KeyError: pass latlon = latlon.split(";") fid.write("%s,%s,%.0f,%s,%s," % (state, k, yb, latlon[0], latlon[1])) fid.write(",".join(["%.0f" % x for x in TR[state][k]]) + "\n")
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question