"""Scrape NFL.com rosters: the team list and each team's player list."""

import csv
import re
import time
import urllib.request

# Extracts (teamID, teamName) pairs from links to team roster pages.
reTeamData = re.compile(
    r'/players/search\?category=team&filter=([0-9]+)&playerType=current">([^<]+)'
)


def getTeamList():
    """Return a list of (teamID, teamName) tuples, one per NFL team."""
    html = urllib.request.urlopen(
        'http://www.nfl.com/players/search?category=team&playerType=current'
    ).read().decode('utf-8', 'replace')
    return reTeamData.findall(html)


# Extracts a player's NFL.com ID plus last and first names.
rePlayerData = re.compile(r'profile\?id=([^"]+)">([^,]+), ([^<]+)')

# Extracts the link to the "next" page of a paginated team roster.
reNextPageURL = re.compile(r'href="([^"]+)">next</a>')


def getTeamPlayers(teamID):
    """Return a team's full roster as (playerID, lastName, firstName) tuples.

    Downloads the first roster page, then keeps following "next" links so
    multi-page rosters are collected completely.
    """
    url = ('http://www.nfl.com/players/search'
           '?category=team&filter=%s&playerType=current' % teamID)
    pageHTML = urllib.request.urlopen(url).read().decode('utf-8', 'replace')
    players = rePlayerData.findall(pageHTML)

    nextURL = reNextPageURL.findall(pageHTML)
    while nextURL:
        # Hrefs scraped out of HTML carry escaped ampersands ("&amp;");
        # unescape them before requesting the next page.  (The original's
        # .replace('&', '&') was a no-op.)
        nextPage = 'http://www.nfl.com' + nextURL[0].replace('&amp;', '&')
        pageHTML = urllib.request.urlopen(nextPage).read().decode('utf-8', 'replace')
        players.extend(rePlayerData.findall(pageHTML))
        nextURL = reNextPageURL.findall(pageHTML)
    return players
""" reHeight = re.compile('Height: ([^ \r\n]+)') reWeight = re.compile('Weight: ([^ \r\n]+)') reAge = re.compile('Age: ([^ \r\n]+)') reCollege = re.compile('College: ([^<]+)') reName = re.compile('<title>([^<]+)') reTeam = re.compile('team=[^"]+">([^<]+)</a>') rePosition = re.compile('\| ([A-Z]{1,4})') def getPlayerInfo(playerID): """ Returns the player's info. """ try: pageData = urllib2.urlopen('http://www.nfl.com/players/profile?id=' + playerID).read() heightTokens = reHeight.findall(pageData)[0].split('-') height = int(heightTokens[0]) * 12 + int(heightTokens[1]) return {'name': reName.findall(pageData)[0], 'position': rePosition.findall(pageData)[0], 'height': height, 'weight': int(reWeight.findall(pageData)[0]), 'age': int(reAge.findall(pageData)[0]), 'college': reCollege.findall(pageData)[0], 'team': reTeam.findall(pageData)[0]} except: print 'Failed to load', playerID # Open the CSV file for output. csvFile = csv.writer(open('players.csv', 'w'), delimiter=',', quotechar='"') # Download the list of teams teams = getTeamList() # For each team, download the list of players for team in teams: print 'Retrieving players from the', team[1] players = getTeamPlayers(team[0]) # For each player, download their info and write it to the CSV file for player in players: playerInfo = getPlayerInfo(player[0]) if playerInfo: csvFile.writerow(playerInfo.values()) # Wait between each player time.sleep(0.1)
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question