# Inc Scraping Functions | Python Fiddle

#Using the functions defined below, we can scrape the Inc. 5000 as follows:

import pandas as pd
import numpy as np
from copy import deepcopy
from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import codecs

#Step 1: collect the profile url of every company on the inc5000 list
#NOTE(review): pagecycle, getgoodurls and extractpage are defined further down
#in this file, so they have to be defined before these statements are executed
browser = webdriver.Firefox()
browser.get('http://www.inc.com/inc5000/list/2014')
goodurls = []
#Walk list pages 1 through 500 in order
for i in range(500):
    pagecycle(i + 1)
    #extend() keeps goodurls one flat list instead of a list of per-page lists
    goodurls.extend(getgoodurls(browser.page_source))

#500 pages at 10 profiles each should leave us with 5000 urls.
#Step 2: visit every profile and pull out its information
data = []
for url in goodurls:
    #extractpage hands back one dict per profile; collect them in order
    data.append(extractpage(browser, url))

#Step 3: a list of dicts converts directly into a Pandas DataFrame
inc5000 = pd.DataFrame(data)

#  ----------Functions-----------

def getgoodurls(page_source):
    """Return the company-profile links found on one page of the list.

    page_source is the HTML text of the page. The href of every anchor
    that has one is collected in document order, and the profile links
    are then picked out of that list purely by position.
    """
    #NOTE(review): no parser is named here, so BeautifulSoup chooses one
    #from whatever is installed - confirm that is intended
    parsed = BS(page_source)
    #Anchors without an href are skipped
    hrefs = [anchor['href'] for anchor in parsed.find_all('a')
             if anchor.has_attr('href')]
    #Every 4th link from position 74 up to (not including) 114, i.e. the
    #10 profiles shown on a page
    return hrefs[74:114:4]

def pagecycle(n):
    """Move the inc5000 list shown in the module-level ``browser`` to page n.

    The next-page arrow and the page index can't be accessed directly, so
    the page number is typed into the number input with keystrokes and
    the 'goButton' element is clicked afterwards.

    n is the page number to go to; it is typed as str(n).
    """
    #Local import so this function carries the one extra name it needs
    from selenium.webdriver.common.by import By

    #find_element(By.CLASS_NAME, ...) is used because the older
    #find_element_by_class_name helper was removed in Selenium 4.3.
    #This first lookup only checks that the go button is on the page.
    #NOTE(review): the key presses below are not directed at the button -
    #they go to whichever element currently has focus. Confirm that the
    #focus is where the Shift+Tab step expects it to be.
    browser.find_element(By.CLASS_NAME, 'goButton')

    actions = ActionChains(browser)
    #Shift+Tab steps back one element, to the number input
    actions.key_down(Keys.SHIFT).send_keys(Keys.TAB).key_up(Keys.SHIFT)
    #Three backspaces clear a page number of up to three digits
    for _ in range(3):
        actions.send_keys(Keys.BACKSPACE)
    actions.send_keys(str(n))
    actions.send_keys(Keys.RETURN)
    actions.perform()

    #Finally, look the go button up again and click it to load the page
    browser.find_element(By.CLASS_NAME, 'goButton').click()

def urlcleaner(url):
    """Return url with its first 23 characters removed."""
    #NOTE(review): 23 is presumably the length of the fixed prefix in front
    #of the company name in a profile url - confirm against real urls
    prefixlen = 23
    return url[prefixlen:]

def extractpage(browser, pageurl):
    """Load one inc profile page and return its information as a dict.

    browser is the selenium driver used to load the page and pageurl is
    the address of the profile.

    The returned dict has the page title under 'Company', plus one entry
    per div whose class is exactly 'dtdd': the inner markup of the div's
    first <dt> is the key and the inner markup of its first <dd> is the
    value. Collecting these dicts allows us to quickly build a DataFrame.
    """
    browser.get(pageurl)
    datadict = {'Company': browser.title}
    soup = BS(browser.page_source)
    #Information will be in divs tagged 'dtdd'
    for div in soup.find_all('div'):
        #get() gives None for a div without a class, which never matches
        if div.get('class') != ['dtdd']:
            continue
        label = div.find('dt')
        if label is None:
            #Nothing to use as a key, so there is nothing to record
            continue
        value = div.find('dd')
        #decode_contents() returns the markup between the opening and closing
        #tag. Slicing the printed result list only gave that when there was
        #exactly one tag and it had no attributes.
        datadict[label.decode_contents()] = (
            value.decode_contents() if value is not None else '')
    return datadict

import string
def numclean(entry):
    """Return entry as a float after dropping its punctuation.

    Every character in string.punctuation except '.' is removed (so '$',
    ',', '%' and also '-' disappear) and the remainder is passed to
    float(), which raises ValueError if it is not a valid number.
    """
    #Build the set of unwanted characters once; '.' stays to keep decimals
    unwanted = set(string.punctuation.replace('.', ''))
    kept = ''.join(ch for ch in entry if ch not in unwanted)
    return float(kept)

def dollarclean(entry):
    """Expand an abbreviated dollar string such as '$10.5 M' into digits.

    entry is the dollar string. A 'B' anywhere in it scales the amount by
    10**9 and an 'M' by 10**6; without either the amount is left unscaled.
    Only the characters 0-9 are kept from the number and the text before
    the first '.' is treated as the whole part.

    Returns a string, e.g. '$10.5 M' -> '10500000' and '$2 B' ->
    '2000000000'. The zero padding follows the number of fractional digits
    actually present, so '$1.25 M' -> '1250000'. If there are more
    fractional digits than the scale absorbs, the decimal point is kept
    at its shifted position ('$12.50' -> '12.50'). An entry without any
    digit gives '0'.
    """
    #Power of ten implied by the suffix letter
    if 'B' in entry:
        exponent = 9
    elif 'M' in entry:
        exponent = 6
    else:
        exponent = 0

    whole, _, fraction = entry.partition('.')
    wholedigits = ''.join(ch for ch in whole if ch in '0123456789')
    fracdigits = ''.join(ch for ch in fraction if ch in '0123456789')
    if not wholedigits and not fracdigits:
        return '0'

    if len(fracdigits) <= exponent:
        #Each fractional digit uses up one of the zeros the suffix stands for
        padding = '0' * (exponent - len(fracdigits))
        return wholedigits + fracdigits + padding
    #More decimals than the scale covers: shift the point, don't drop digits
    return wholedigits + fracdigits[:exponent] + '.' + fracdigits[exponent:]

# Python Fiddle

# Python Cloud IDE