import zipfile
import os
import pandas as pd
import StringIO
import numpy as np
import gzip

"""
Generate a single text data file from the National Bridge Inventory zip
archives.

The columns of the data file that is generated are:

  Column 1:   State abbreviation
  Column 2:   Bridge identifier
  Column 3:   Year the bridge was constructed
  Columns 4-: Traffic levels

The years of the traffic levels range from year0 to yearmax, inclusive (see
below).  When the traffic level was not observed, it is recorded as nan.

This script takes several hours to run.
"""

## Need to set this to a path that can be used to store temporary files.
## NOTE: this must be a raw string -- in a plain Python 2 string literal the
## "\a" in "\ahallo" is the BEL escape character and silently corrupts the path.
tmp_path = r"C:\Users\ahallo\Desktop\Current Work\National Bridge Inventory\pre2009BayArea"

## This year is taken as the time origin.
year0 = 1990

## Discard data after this year.
yearmax = 2009

## Fixed-width field positions (1-based, inclusive) of each NBI item, from
## http://www.fhwa.dot.gov/bridge/nbi/format.cfm
State = [1, 3]
Structure_id = [4, 18]
Traffic = [165, 170]
Traffic_year = [171, 174]
Year_built = [157, 160]
Latitude = [130, 137]
Longitude = [138, 146]
Year_reconstructed = [362, 365]
item_5a = [19, 19]
item_49 = [223, 228]
item_112 = [374, 374]
item_42a = [200, 200]

colspecs = [State, Structure_id, Traffic, Traffic_year, Year_built, item_5a,
            item_49, item_112, item_42a, Year_reconstructed, Latitude, Longitude]

## Convert to the zero-based, half-open [start, stop) intervals that
## pd.read_fwf expects: shift the start down by one, leave the stop as-is.
for c in colspecs:
    c[0] -= 1

## Use these column names for the columns in `colspecs`.
## Positional column index -> human-readable name, matching `colspecs` order.
colnames = {0: "State", 1: "Structure_id", 2: "Traffic", 3: "Traffic_year",
            4: "Year_built", 5: "item_5a", 6: "item_49", 7: "item_112",
            8: "item_42a", 9: "Year_reconstructed", 10: "Latitude",
            11: "Longitude"}

## Two-letter state abbreviation -> numeric NBI state code.
ST = {"AL": 14, "AK": 20, "AZ": 49, "AR": 56, "CA": 69,
      "CO": 88, "CT": 91, "DE": 103, "DC": 113, "FL": 124,
      "GA": 134, "HI": 159, "ID": 160, "IL": 175, "IN": 185,
      "IA": 197, "KS": 207, "KY": 214, "LA": 226, "ME": 231,
      "MD": 243, "MA": 251, "MI": 265, "MN": 275, "MS": 284,
      "MO": 297, "MT": 308, "NE": 317, "NV": 329, "NH": 331,
      "NJ": 342, "NM": 356, "NY": 362, "NC": 374, "ND": 388,
      "OH": 395, "OK": 406, "OR": 410, "PA": 423, "RI": 441,
      "SC": 454, "SD": 468, "TN": 474, "TX": 486, "UT": 498,
      "VT": 501, "VA": 513, "WA": 530, "WV": 543, "WI": 555,
      "WY": 568}

## Reverse map: numeric NBI state code -> two-letter abbreviation.
STR = dict((code, abbrev) for abbrev, code in ST.items())


def my_float(x):
    """Parse x as a float; return nan when the value is not parseable."""
    try:
        value = float(x)
    except ValueError:
        value = float("nan")
    return value


## TR[state][bridge] is a vector of annual traffic counts starting in year
## `year0`.  Missing data are denoted nan.
TR = {}

## YB[state][bridge] is the year that bridge `bridge` in state `state` was built.
YB = {}

## LatLon[state][bridge] is the location of bridge `bridge` in state `state`.
LatLon = {} ## Loop over the zip archives L = os.listdir(tmp_path) L = [x for x in L if x.endswith(".zip")] for file in L: print "Processing " + file + ":" ## The path to the zip archive dpath = os.path.join(tmp_path, file) ## Open the zip archive fid = zipfile.ZipFile(dpath) ## Get a list of files in the archive IL = fid.infolist() ## Loop over the files, and process them one by one for ilj,il in enumerate(IL): print " %.0f%% " % np.floor(100*ilj/float(len(IL))) ## The name of a file in the archive fname = il.filename ## Read the entire file, and wrap it in a stringio D = fid.read(fname) sio = StringIO.StringIO(D) ## Put the data into a pandas data frame D = pd.read_fwf(sio, colspecs=colspecs, header=None) ## Give the columns useful names D.rename(columns=colnames, inplace=True) ## Require certain items to be numbers. for vn in "item_5a","Traffic": D[vn] = D[vn].map(my_float) D[vn] = D[vn].astype(np.float64) """The NBI database includes bridges that are not highway bridges. Here, we include only the highway bridges in our analysis. 
The following code accomplishes this selection.""" ## Drop any rows with missing values D = D.dropna() ## Loop over the records in a file for ix in D.index: ## The state for the current record try: state = STR[D.loc[ix,"State"]] except KeyError: continue ## The first time this state was encountered if state not in TR: TR[state] = {} YB[state] = {} LatLon[state] = {} ## The first time this bridge was encountered structure_id = D.loc[ix,"Structure_id"] ## The first time this structure was encountered if structure_id not in TR[state]: TR[state][structure_id] = [float("nan"),]*(yearmax - year0 + 1) ## Be prepared for non-paresable year data try: YB[state][structure_id] = float(D.loc[ix,"Year_built"]) except ValueError: YB[state][structure_id] = float("nan") lat = str(D.loc[ix,"Latitude"]) lon = str(D.loc[ix,"Longitude"]) latlon = lat + ";" + lon LatLon[state][structure_id] = latlon ## Store the traffic data if possible try: year = int(D.loc[ix,"Traffic_year"]) except ValueError: continue if year >= year0 and year <= yearmax: traffic = D.loc[ix,"Traffic"] try: TR[state][structure_id][year-year0] = traffic except ValueError: continue ## Save all the results with gzip.open("../Data/nbi_data.csv.gz", "w") as fid: fid.write("State,Structure id,Year built,Latitude,Longitude,") fid.write(",".join([str(y) for y in range(year0, yearmax+1)]) + "\n") ## We will write out the data for the states in alphabetical order States = TR.keys() States.sort() for state in States: ## Alphabetically sort the bridge id's, file easier to ## manually scan. K = TR[state].keys() K.sort() for k in K: ## Some states may be missing year of construction data yb = float("nan") try: yb = YB[state][k] except KeyError: pass latlon = "" try: latlon = LatLon[state][k] except KeyError: pass latlon = latlon.split(";") fid.write("%s,%s,%.0f,%s,%s," % (state, k, yb, latlon[0], latlon[1])) fid.write(",".join(["%.0f" % x for x in TR[state][k]]) + "\n")
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question