"""Scrape executive names and designations from a company leadership page.

The user picks a company from a numbered menu. The matching page is
downloaded, names and designations are extracted with XPath, and the
pairs are written to ``<company>_result.csv`` in the current directory.

The script runs on both Python 2 and Python 3. Every ``print`` call takes
exactly one argument so that it means the same thing under either
interpreter without needing a ``__future__`` import.
"""
import csv
import sys

from lxml import html
import requests

_PY3 = sys.version_info[0] >= 3

# Seconds to wait for a server before giving up; without a timeout
# ``requests.get`` can block forever on an unresponsive host.
_REQUEST_TIMEOUT = 30

try:
    # Python 2: the built-in input() eval()s whatever the user types, so
    # read the raw line instead.
    _read_line = raw_input  # noqa: F821
except NameError:
    # Python 3: input() already returns the raw line.
    _read_line = input

# Menu number -> (page URL, XPath selecting the names,
#                 XPath selecting the designations, CSV file stem).
_SITES = {
    1: (
        'https://news.starbucks.com/leadership',
        '//*[@id="leaders"]/div/a/h2/text()',
        '//*[@id="leaders"]/div/a/h3/text()',
        "starbucks_result",
    ),
    2: (
        'https://corporate.target.com/press/leadership',
        '//*[@id="aspnetForm"]/section[2]/div/article/div/span/a/h3/text()',
        '//*[@id="aspnetForm"]/section[2]/div/article/div/span/h4/text()',
        "target_result",
    ),
    3: (
        'http://www.timewarner.com/company/management/senior-corporate-executives',
        '//*[@id="content"]/div/article/div/div/div/div/div/div/div/h4/a/text()',
        '//*[@id="content"]/div/article/div/div/div/div/div/div/div/p/text()',
        "timewarner_result",
    ),
    4: (
        'http://investors.gilead.com/phoenix.zhtml?c=69964&p=irol-govmanage',
        '//tbody/tr[1]/td/span[1]/text()',
        '//tbody/tr[1]/td/span[2]/text()',
        "gilead_result",
    ),
    5: (
        'https://newsroom.mtb.com/leadership-team/',
        '//*[@id="big_box"]/div/div/h2/a/text()',
        '//*[@id="big_box"]/div/div/h3/text()',
        "mtb_result",
    ),
    6: (
        'http://phx.corporate-ir.net/phoenix.zhtml?c=97664&p=irol-govmanage',
        '//*[@id="thomson-reuters"]/table/tbody/tr/td[1]/table/tbody/tr[2]/td[1]/table[2]/tbody/tr/td/span[1]/a/text()',
        '//*[@id="thomson-reuters"]/table/tbody/tr/td[1]/table/tbody/tr[2]/td[1]/table[2]/tbody/tr/td/span[2]/text()',
        "amazon_result",
    ),
}


def display_options():
    """Print the numbered menu of companies that can be scraped."""
    print("Press 1 for scraping Starbucks")
    print("Press 2 for scraping Target")
    print("Press 3 for scraping Time Warner")
    print("Press 4 for scraping GiLead")
    print("Press 5 for scraping M&T Bank")
    print("Press 6 for scraping Amazon")


def _read_choice():
    """Prompt for a menu choice.

    Returns:
        The entered number as an ``int``, or the stripped raw text when it
        is not a whole number (``scrape_data`` then reports it as invalid).
    """
    raw = _read_line("Enter your choice: ").strip()
    try:
        return int(raw)
    except ValueError:
        return raw


def _open_csv(path):
    """Open ``path`` for writing in the mode the csv module expects."""
    if _PY3:
        # Python 3's csv writer wants a text file with newline translation
        # disabled so it can emit its own line terminators.
        return open(path, "w", newline="", encoding="utf-8")
    # Python 2's csv writer wants a binary file.
    return open(path, "wb")


def _cell(value):
    """Return ``value`` in a form the running interpreter's csv writer accepts."""
    # Python 2's csv module cannot write non-ASCII unicode objects, so hand
    # it UTF-8 bytes. On Python 3 the file's encoding takes care of this.
    if not _PY3 and not isinstance(value, str):
        return value.encode("utf-8")
    return value


def scrape_data(choice):
    """Scrape the selected company's leadership page into a CSV file.

    Args:
        choice: Menu number (1-6) identifying the company.

    Prints "Your selected choice is invalid" and returns without fetching
    or writing anything when ``choice`` is not a known menu number.
    Otherwise writes ``./<company>_result.csv`` with a ``Name, Designation``
    header followed by one row per scraped executive.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the server does not answer within the timeout).
    """
    site = _SITES.get(choice)
    if site is None:
        print("Your selected choice is invalid")
        return
    url, names_xpath, designations_xpath, file_name = site

    page = requests.get(url, timeout=_REQUEST_TIMEOUT)
    tree = html.fromstring(page.content)
    executive_names = tree.xpath(names_xpath)
    designations = tree.xpath(designations_xpath)

    # The context manager guarantees the file is flushed and closed even if
    # writing a row fails.
    with _open_csv("./%s.csv" % file_name) as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["Name", "Designation"])
        # Rows are written name-first so the columns line up with the header.
        for name, designation in zip(executive_names, designations):
            writer.writerow([_cell(name), _cell(designation)])


def main():
    """Show the menu, read the user's choice and scrape the chosen site."""
    display_options()
    choice = _read_choice()
    print("You selected %s as your choice" % (choice,))
    scrape_data(choice)


if __name__ == "__main__":
    main()
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question