# -*- coding: utf-8 -*-
# File: download_data.py
# Description: Download data from ESPN's NBA database
# Author:
# Email:

import re
import time
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

###############################################################################
# UTILITY FUNCTIONS
###############################################################################


def BS(html):
    """Parse an HTML document string with the lxml parser."""
    return BeautifulSoup(html, "lxml")


def soup_link(link):
    """Download the page at *link* and return it as a BeautifulSoup tree."""
    content = urlopen(link).read()
    return BS(content)


# Matches shot distances such as "24-foot jumper"; compiled once, reused per row.
_DISTANCE_PATTERN = re.compile(r"([0-9]+)-foot")


def parse_distance(string):
    """Return the shot distance in feet (as a string of digits), or None.

    BUG FIX: the original pattern was the literal text "r'[0-9]*-foot jumper'"
    (quotes and r-prefix included), which could never match, and the sentinel
    check `result != " "` was always true, so a line without any digits
    crashed with AttributeError on `.group()`.
    """
    match = _DISTANCE_PATTERN.search(string)
    return match.group(1) if match else None


def parse_points_scored(string):
    """Return the point value of the described shot: 3, 1 (free throw), else 2."""
    if "three point" in string:
        return 3
    if "free throw" in string:
        return 1
    return 2


def parse_shot_success(string):
    """Classify a play-by-play line as "scores", "misses", or "not a shot".

    BUG FIX: the original tested `("blocks" or "misses") in string`, which
    short-circuits to `"blocks" in string` only, so ordinary missed shots
    were classified as "not a shot".
    """
    if "makes" in string:
        return "scores"
    if "blocks" in string or "misses" in string:
        return "misses"
    return "not a shot"


def get_player_in_action(string, list_players):
    """Return the roster player who attempted the shot described in *string*.

    For blocked shots ("X blocks Y's ... shot") only the text after the word
    "blocks" is searched, so the shooter — not the defender — is returned.
    Returns "Not in team" when no roster player appears in the text.

    BUG FIX: the original called `.search()` on plain `str` objects
    (AttributeError), sliced a string by another string's `find` result, and
    its "blocks" branch was unreachable dead code.
    """
    if "blocks" in string:
        # Drop the defender's name before "blocks"; the shooter follows it.
        string = string.split("blocks", 1)[1]
    for player in list_players:
        if player in string:
            return player
    return "Not in team"


###############################################################################
# SCRAPING FUNCTIONS
###############################################################################


def get_players(team_id, year):
    """Scrape the team stats page and return the list of player-name strings.

    Inputs:
        team_id: string, ESPN team abbreviation (e.g. "por")
        year: int, season year

    BUG FIX: the original called `player_names.append()` with no argument
    (TypeError), collected bs4 Tag objects instead of their text, and
    compared a Tag against the string "Totals...".
    NOTE(review): the "colhead" row selector is kept from the original —
    confirm these rows actually carry player names on the live page.
    """
    link = "http://espn.go.com/nba/team/stats/_/name/{}/year/{}/".format(team_id, year)
    soup = soup_link(link)
    stat_rows = soup.findAll("tr", {"class": "colhead"})
    player_names = []
    for row in stat_rows:
        text = row.get_text()
        if "Totals" in text:
            continue  # summary row, not a player
        player_names.append(text)
    return player_names


def extract_gamelog_row(row, list_players, home_or_away):
    """Parse one play-by-play table row into a dict of shot information.

    Inputs:
        row: bs4 Tag, one <tr> of the play-by-play table
        list_players: list of player-name strings for the tracked team
        home_or_away: string, "home" or "away" — selects the event column
    Returns:
        dict with keys text/player/shot_success/points_scored/distance,
        or an empty dict for rows without an event column.

    BUG FIX: points were gated on `parse_points_scored(text) == "scores"`,
    an int-vs-str comparison that is always False, so points_scored was
    always 0. The gate now uses the parsed shot_success value.
    """
    pieces = list(row.children)
    info_dict = {}
    if len(pieces) < 3:
        return info_dict
    # Away events sit in the second cell, home events in the fourth.
    game_event = pieces[1] if home_or_away == "away" else pieces[3]
    text = game_event.get_text()
    info_dict["text"] = text
    info_dict["player"] = get_player_in_action(text, list_players)
    info_dict["shot_success"] = parse_shot_success(text)
    if info_dict["shot_success"] == "scores":
        info_dict["points_scored"] = parse_points_scored(text)
    else:
        info_dict["points_scored"] = 0
    info_dict["distance"] = parse_distance(text)
    # Certain shot types are assigned a fixed distance when none is stated.
    if not info_dict["distance"]:
        if "dunk" in text:
            info_dict["distance"] = 0
        elif "layup" in text or "tip shot" in text:
            info_dict["distance"] = 1
        elif "three point" in text:
            info_dict["distance"] = 23
    return info_dict


def structure_gamelog_info(game_id, list_players, home_or_away):
    """Parse information from one game's play-by-play log.

    Inputs:
        game_id: the id of the game
        list_players: a list of strings, the players of the team
        home_or_away: string, "home" or "away"
    Returns:
        Pandas data frame with one row per parsed play-by-play event.

    BUG FIX: the original created an empty dict, then skipped the row when
    `len(info_dict) == 0` — always true — so `extract_gamelog_row` was never
    called and the frame was always empty. The "fill in here" markers are
    now completed: each row is parsed and appended.
    """
    game_link = "http://espn.go.com/nba/playbyplay?gameId={}&period=0".format(game_id)
    gamelog = soup_link(game_link)
    report_lines = gamelog.findAll("tr", {"class": "even"})
    list_info_dicts = []
    for line in report_lines:
        info_dict = extract_gamelog_row(line, list_players, home_or_away)
        if not info_dict:
            continue  # row had no event column — nothing to record
        info_dict["game_id"] = game_id
        list_info_dicts.append(info_dict)
    return pd.DataFrame(list_info_dicts)


def parse_all_games(df_game_ids, list_players):
    """Scrape every game listed in *df_game_ids* and concatenate the results.

    df_game_ids must carry "game_id" and "home_or_away" columns.
    """
    list_dfs = []
    for _, row in df_game_ids.iterrows():
        game_id = row["game_id"]
        home_or_away = row["home_or_away"]
        print(game_id)
        df = structure_gamelog_info(game_id, list_players, home_or_away)
        list_dfs.append(df)
        time.sleep(1)  # be polite to the server: one request per second
    return pd.concat(list_dfs)


###############################################################################
# RUN THE SCRAPING
###############################################################################


def main():
    """Scrape the configured team/season and write the result to CSV."""
    # ---- define parameters
    team_id = "por"
    year = 2015
    data_folder = "../data"

    # ---- get the players
    list_players = get_players(team_id, year)

    # ---- scrape one game for testing purposes
    game_id = 400579510
    home_or_away = "away"
    df_game = structure_gamelog_info(game_id, list_players, home_or_away)
    # sanity check: summed points should match the team's real game total
    print(df_game["points_scored"].sum())

    # ---- scrape all games
    # -------- read the game ids
    df_game_ids = pd.read_csv(data_folder + "/gameids_{}_{}.csv".format(team_id, year))
    # -------- run the parser
    df_all = parse_all_games(df_game_ids, list_players)
    # -------- output the result
    df_all.to_csv(data_folder + "/all_games_{}_{}.csv".format(team_id, year),
                  index=False)


# Guard so importing this module does not immediately start scraping.
if __name__ == "__main__":
    main()
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question