#!/usr/bin/env python
import pdb
#pdb.set_trace()
import urllib2
from lxml import etree
import csv
import time
import unicodedata
import math
from itertools import groupby
import os
import random
import json
import gspread
from oauth2client.client import SignedJwtAssertionCredentials

#os.chdir('/root/working_scripts')


def get_rank(tree):
    '''Get the sales rank of the product.'''
    ranks = []
    for rr in tree.xpath('//li[@id="SalesRank"]/text()'):
        if rr.strip() != "":
            ranks.append(rr.strip())
    if ranks:
        # "#1,234 in Category" -> "1,234"
        return ranks[0][1:ranks[0].find("in")].strip()
    return "No Ranking"


def get_author(tree):
    '''Get the authors who wrote the reviews.'''
    # The first two matches are page furniture, not reviewers.
    return tree.xpath('//div[@class="a-row"]/span/a/text()')[2:]


def get_author_page(tree):
    '''Get the profile URLs of the reviewers.'''
    urls = tree.xpath('//div[@class="a-row"]/span/a/@href')[2:]
    return ["http://www.amazon.com" + uu for uu in urls]


def get_review_rank(tree):
    '''Get the star rating of each review.'''
    return tree.xpath('//span[@class="a-icon-alt"]/text()')[2:]


def get_date(tree):
    '''Get the dates the reviews were written.'''
    dates = tree.xpath('//span[@class="a-size-base a-color-secondary review-date"]/text()')[2:]
    # Strip the leading "on " from e.g. "on January 1, 2015".
    return [dd[2:].strip() for dd in dates]


def get_product(ss):
    '''Get the product slug from a product URL.'''
    index = ss.find(".com/") + 5
    end_index = ss.find("/", index)
    return ss[index:end_index]


def get_asin(ss):
    '''Get the ASIN of the product from its URL.'''
    # Note: find() matches the first "dp" anywhere in the URL, so a product
    # slug that happens to contain "dp" would break this.
    index = ss.find("dp")
    return ss[index + 3:]


def get_title(tree):
    '''Get the titles of the reviews.'''
    return tree.xpath('//a[@class="a-size-base a-link-normal review-title a-color-base a-text-normal a-text-bold"]/text()')


def unicode_to_string(types):
    '''Cast unicode to a plain string so it can be written to CSV.'''
    try:
        return unicodedata.normalize("NFKD", types).encode('ascii', 'ignore')
    except Exception:
        return types


def remove_duplicates(combos):
    '''Remove all duplicates while preserving order.'''
    seen = []
    unique = []
    for cc in combos:
        if cc not in seen:
            seen.append(cc)
            unique.append(cc)
    return unique


def get_links_ranking(ss, page_index, ranks):
    '''Get the product links from one search-results page, tagged with rank.'''
    data = []
    index = ss.find("s/") + 2
    part_url = ss[:index] + "ref=sr_pg_" + str(page_index) + "?&page=" + str(page_index)
    key_word_index = ss.find("&keywords")
    url = part_url + ss[key_word_index:]
    responses = urllib2.urlopen(url).read()
    trees = etree.HTML(responses)
    links = trees.xpath('//div[@class="a-row a-spacing-small"]//a[@class="a-link-normal s-access-detail-page a-text-normal"]/@href')
    for ll in links:
        data.append([ll, ranks])
        ranks += 1
    page_index += 1
    return data, page_index, ranks


def get_num_pages(tree):
    '''Get the number of review pages to cycle through (10 reviews per page).'''
    num = tree.xpath('//span[@class="a-size-medium totalReviewCount"]//text()')[0]
    num = num.replace(",", "")
    return math.ceil(int(num) / 10.0)


def get_rank_product_asin(trees, s):
    '''Feeder function: get the rank, product slug and ASIN in one call.'''
    return get_rank(trees), get_product(s), get_asin(s)


def scrape_review_page(rr):
    '''Scrape a review page and return the lxml tree for the XPath helpers.'''
    time.sleep(3)
    review_response = urllib2.urlopen(rr).read()
    return etree.HTML(review_response)
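
# A quick illustration of the URL-parsing helpers above (the URL is a made-up
# example; real product pages follow the same /<slug>/dp/<ASIN> shape):
#
#     >>> get_product("http://www.amazon.com/Example-Product/dp/B00TEST123")
#     'Example-Product'
#     >>> get_asin("http://www.amazon.com/Example-Product/dp/B00TEST123")
#     'B00TEST123'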

def get_keywords():
    '''Read the keywords text file and build the start URLs.'''
    keywords = []
    with open('group6.txt', 'r') as f:
        for line in f:
            lines = line.split(",")
            groups = lines[1].strip()
            groups = '+'.join(groups.split())
            keywords.append([lines[0], groups])
    return get_start_url(keywords)


def get_start_url(keywords):
    '''Build a search URL for every keyword phrase.'''
    # '+'.join handles single- and multi-word phrases alike.
    start_urls = []
    for kk in keywords:
        query = '+'.join(kk[0].split())
        start_urls.append(["http://www.amazon.com/s/&keywords=" + query, kk[1]])
    return start_urls


def get_review_link(tree):
    '''Get the permalink of each review, minus its query string.'''
    links = tree.xpath('//a[@class="a-size-base a-link-normal review-title a-color-base a-text-normal a-text-bold"]/@href')
    return ["www.amazon.com" + lk[:lk.find("?")] for lk in links]


def get_brand(tree):
    '''Get the brand name from the product by-line.'''
    return tree.xpath('//div[@class="a-row product-by-line"]//a[@class="a-size-base a-link-normal"]/text()')[0]


def write_spreadsheet(data):
    '''Append the scraped rows to the "historic" Google spreadsheet.'''
    json_key = json.load(open('keys.json'))
    scope = ['https://spreadsheets.google.com/feeds']
    credentials = SignedJwtAssertionCredentials(json_key['client_email'], json_key['private_key'], scope)
    gc = gspread.authorize(credentials)
    wks = gc.open("historic").sheet1
    row = len(wks.get_all_values()) + 1
    for dat in data:
        col = 1
        for dd in dat:
            try:
                wks.update_cell(row, col, dd)
            except Exception:
                print "Error Inputting Value"
            col += 1
        print "Inserting Row #: " + str(row) + "/" + str(len(data))
        row += 1


def write_csv(data):
    '''Append all of the scraped rows to a CSV.'''
    print "Writing Templates to CSV File..."
    with open("product_historical_test.csv", 'a') as f:
        writer = csv.writer(f)
        writer.writerows(data)


def write_missed(data):
    '''Append the review URLs that could not be scraped to a CSV.'''
    print "Writing Missed URLs to CSV File..."
    with open("missed.csv", 'a') as f:
        writer = csv.writer(f)
        writer.writerows(data)
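
# The script expects two local input files. The formats below are inferred
# from get_keywords() and write_spreadsheet(); the sample values are
# illustrative, not taken from the original:
#
#   group6.txt  -- one "search phrase,group label" pair per line, e.g.:
#       coffee maker,group 1
#       usb cable,group 2
#
#   keys.json   -- a Google service-account key containing at least the
#       "client_email" and "private_key" fields used to authorize gspread.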

def main():
    start_urls = get_keywords()
    for ss in start_urls:
        datas = []
        key_word = ss[0][ss[0].find("=") + 1:]
        page_index = 1
        ranks = 1
        # Collect product links from the first two search-results pages.
        while page_index < 3:
            print "Getting Product Links to Scrape #: " + str(page_index)
            try:
                data, page_index, ranks = get_links_ranking(ss[0], page_index, ranks)
                datas.extend(data)
            except Exception:
                print "Skipping Product Link #: " + str(page_index)
                page_index += 1  # was missing, so a failed fetch looped forever
        looped = 1
        for dd in datas:
            missed = []
            print "Links Looping through: " + str(looped) + "/" + str(len(datas))
            reviews = []
            num_pages = 0
            rank = product = asin = None
            # Fetch the product page and enumerate its review pages, retrying
            # up to four times with a random delay. This replaces four nearly
            # identical nested try/except blocks in the original.
            for _ in range(4):
                try:
                    time.sleep(random.randint(0, 7))
                    responses = urllib2.urlopen(dd[0]).read()
                    trees = etree.HTML(responses)
                    rank, product, asin = get_rank_product_asin(trees, dd[0])
                    time.sleep(random.randint(0, 7))
                    review_page = dd[0].replace("dp", "product-reviews")
                    page_response = urllib2.urlopen(review_page).read()
                    tree = etree.HTML(page_response)
                    num_pages = get_num_pages(tree)
                    if num_pages == 0.0:
                        num_pages = 1
                    for i in range(1, int(num_pages) + 1):
                        reviews.append(review_page + "/ref=cm_cr_pr_btm_link_" + str(i) + "?&pageNumber=" + str(i))
                    break
                except Exception:
                    continue
            else:
                # Every attempt failed: record the review URLs we could not reach.
                review_page = dd[0].replace("dp", "product-reviews")
                for i in range(1, int(num_pages) + 1):
                    missed.append(review_page + "/ref=cm_cr_pr_btm_link_" + str(i) + "?&pageNumber=" + str(i))
                print "Added to Missed!"
            # groupby only collapses *adjacent* duplicates, which suffices here
            # because retries append the same URLs back-to-back.
            reviews = [key for key, _ in groupby(reviews)]
            i = 1
            data_filt = []
            combo = []
            for rr in reviews:
                # Scrape each review page, retrying once after a random delay.
                try:
                    tree = scrape_review_page(rr)
                    combo.append([get_title(tree), get_author(tree), get_author_page(tree),
                                  get_review_rank(tree), get_date(tree), get_review_link(tree),
                                  dd[1], get_brand(tree), rank, key_word, product, asin, ss[1]])
                    print "Scraping Page #: " + str(i) + "/" + str(len(reviews)) + " Of Product: " + product
                except Exception:
                    try:
                        time.sleep(random.randint(0, 5))
                        tree = scrape_review_page(rr)
                        combo.append([get_title(tree), get_author(tree), get_author_page(tree),
                                      get_review_rank(tree), get_date(tree), get_review_link(tree),
                                      dd[1], get_brand(tree), rank, key_word, product, asin, ss[1]])
                    except Exception:
                        print "Skipped Page #: " + str(i) + "/" + str(len(reviews)) + " Of Product: " + product
                i += 1
            # Each combo entry holds six per-review lists followed by seven
            # page-level scalars; zip the lists into one row per review. The
            # original flattened only combo[0], silently dropping every review
            # page after the first.
            for cc in combo:
                try:
                    data_filt.extend([list(tt) + cc[-7:] for tt in zip(*cc[:-7])])
                except Exception:
                    print "Index Wrong.."
            try:
                print len(data_filt)
                filt_combos = remove_duplicates(data_filt)
                # Write the de-duplicated rows; the original wrote data_filt,
                # which made remove_duplicates a no-op.
                write_csv(filt_combos)
            except Exception:
                print "Error Final!"
            try:
                write_missed(missed)
            except Exception:
                print "Cant write Missed URL CSV..."
            #write_spreadsheet(data_filt)
            looped += 1


if __name__ == '__main__':
    main()
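
# --- Note on the two deduplication strategies used above --------------------
# itertools.groupby collapses only *adjacent* duplicates, making it a cheap
# dedup for the retry-generated review URLs, while remove_duplicates() drops
# every repeat and preserves first-seen order. A minimal sketch with
# illustrative values:
#
#     >>> from itertools import groupby
#     >>> [k for k, _ in groupby(['a', 'a', 'b', 'a'])]
#     ['a', 'b', 'a']
#     >>> remove_duplicates(['a', 'a', 'b', 'a'])
#     ['a', 'b']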