from datetime import datetime
import time
import csv
import re

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


class Spider(object):
    """Spider class with crawling functions."""

    def __init__(self):
        options = Options()
        options.add_argument('--headless')  # Run Chrome in headless mode.
        options.add_argument('--no-sandbox')  # Bypass the OS security model.
        options.add_argument('start-maximized')
        options.add_argument('disable-infobars')
        options.add_argument('--disable-extensions')
        options.add_argument('disable-blink-features=AutomationControlled')
        options.add_argument('user-agent=fake-useragent')
        CHROMEDRIVER_PATH = '/Users/saxena/tutorial-env/chromedriver'
        # Selenium 3 style: the driver binary path is passed positionally.
        self.driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)
        self.outdict = []  # One dict per crawled product.
        self.img_base_path = "https://www.ethicon.com"
        self.soup = BeautifulSoup('<html></html>', 'html.parser')

    def sanitize_text(self, text):
        """Trim surrounding whitespace from scraped text."""
        return str(text).strip()

    def fetch_summarylinks(self, url):
        """Print every /product/ link on a listing page, then follow pagination."""
        self.driver.get(url)
        for elem in self.driver.find_elements_by_xpath("//a[@href]"):
            elem_url = elem.get_attribute("href")
            if re.match(".*/product/.*", elem_url):
                print(elem_url)

        # Pagination: follow the "next" pager link until it is disabled.
        while True:
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "pager__item--next")))
            next_page_btn = self.driver.find_element_by_class_name("pager__item--next")
            if next_page_btn.get_attribute("aria-disabled") == "true":
                print("No more pages left")
                break
            next_page_elem = next_page_btn.find_element_by_xpath(".//a[@href]")
            print(next_page_elem.get_attribute("href"))
            time.sleep(5)
            self.fetch_summarylinks(next_page_elem.get_attribute("href"))
            return  # The recursive call already walked the remaining pages.

    def fetch_productlinks(self, url):
        """Crawl the product-code (/code/) detail links found on a product page."""
        self.driver.get(url)
        counter = 0
        for elem in self.driver.find_elements_by_xpath("//a[@href]"):
            elem_url = elem.get_attribute("href")
            if re.match(".*/code/.*", elem_url):
                if counter <= 1:  # Limit to the first two links while testing.
                    self.extract_imagesandmetadata(elem_url, self.driver)
                counter += 1

    def extract_imagesandmetadata(self, url, driver):
        """Open a product-code page in a new tab and scrape its image and title."""
        before_window = driver.current_window_handle
        # Keyboard shortcuts (Keys.COMMAND + 't') do not open tabs in headless
        # Chrome, so open one via JavaScript and switch to the newest handle.
        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[-1])
        driver.get(url)
        print("Crawling ..." + url)
        html = driver.page_source
        if html:
            self.soup = BeautifulSoup(html, 'html.parser')
            tempdict = {}  # Fresh dict per product so CSV rows don't alias.
            pimg = self.soup.find('img', {'class': 'img-responsive'})
            if pimg:
                print(pimg["src"])
                tempdict["pimg"] = self.img_base_path + pimg["src"]
            else:
                tempdict["pimg"] = ""
            ptitle = self.soup.find('h1', {'class': 'eprc-title'})
            if ptitle:
                print(self.sanitize_text(ptitle.text))
                tempdict["ptitle"] = self.sanitize_text(ptitle.text)
            else:
                tempdict["ptitle"] = ""
            tempdict["purl"] = url
            self.outdict.append(tempdict)
        driver.close()  # Close the scratch tab.
        driver.switch_to.window(before_window)

    def main(self):
        start_time = datetime.now()
        url = ('https://www.jnjmedicaldevices.com/en-US/company/ethicon/'
               'products?items_per_page=50')
        # self.fetch_summarylinks(url)
        self.fetch_productlinks(
            "https://www.jnjmedicaldevices.com/en-US/product/endopath-xcel-trocars")
        self.driver.quit()

        if self.outdict:
            keys = self.outdict[0].keys()
            with open('_log.csv', 'w', newline='') as output_file:
                dict_writer = csv.DictWriter(output_file, keys)
                dict_writer.writeheader()
                dict_writer.writerows(self.outdict)

        print('Time taken in minutes : ' + str(
            round((datetime.now() - start_time).total_seconds() / 60.0, 2)))


if __name__ == '__main__':
    Spider().main()
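Note: the script above uses the Selenium 3 API (find_elements_by_xpath and a positional chromedriver path in webdriver.Chrome). Those helpers were removed in Selenium 4. Below is a minimal sketch of the equivalent setup and lookups, assuming Selenium 4.x; the chromedriver path is the author's local path and would need adjusting.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

options = Options()
options.add_argument('--headless')

# Selenium 4 wraps the driver binary path in a Service object.
driver = webdriver.Chrome(
    service=Service('/Users/saxena/tutorial-env/chromedriver'),
    options=options)

# The find_element(s)_by_* helpers are replaced by By locators.
elems = driver.find_elements(By.XPATH, "//a[@href]")
next_btn = driver.find_element(By.CLASS_NAME, "pager__item--next")
driver.quit()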