import sys
import logging
import datetime
import scrapy
from scrapy.crawler import CrawlerProcess
import json
from argparse import Namespace
import requests
import re
import os

logger = logging.getLogger(__name__)

import pprint
pp = pprint.PrettyPrinter(indent=4)


class CoopSpider(scrapy.Spider):
    name = 'coop'  # Spider.__init__ requires a name attribute
    indexAgo = 0

    def __init__(self, inDown=None, outDown=None, *args, **kwargs):
        logger.debug("init of %s", self.__class__.__name__)
        if inDown is None:
            logger.error('Failed to find input parameter')
            raise Exception('unknown inDown')
        if outDown is None:
            logger.error('Failed to find output parameter')
            raise Exception('unknown outDown')
        self.start_urls = [inDown.startUrl]
        self.startUrl = inDown.startUrl
        self.jsonsList = []
        self.outputFolder = inDown.outputFolder
        self.outDown = outDown
        self.outDown.links = []
        # super(scrapy.Spider, self) skipped Spider.__init__ entirely;
        # start from this class so the base initializer actually runs
        super(CoopSpider, self).__init__(*args, **kwargs)
        logger.debug("end init of %s", self.__class__.__name__)

    def parse(self, response):
        try:
            logger.debug("parse of %s", self.__class__.__name__)
            # id="date_selector"
            params = {
                'type': 'gzip',
                'agree': '1',
            }
            newUrl = 'http://coopisrael.coop/home/branches_to_xml'
            yield scrapy.FormRequest(url=newUrl, method='POST',
                                     callback=self.parseBranches, formdata=params)
        except Exception:
            logging.exception('parse')
            with open('lastHtml.htm', 'wb') as f:  # response.body is bytes
                f.write(response.body)
            raise

    def parseBranches(self, response):
        logger.info('parseBranches started')
        # pp.pprint(response.__dict__)
        contentDisp = response.headers.get('Content-Disposition').decode('utf-8')
        # logger.info('contentDisp=%s' % contentDisp)
        nameToSave = re.findall('filename=(.*)', contentDisp)[0]
        logger.info('nameToSave=%s' % nameToSave)
        filePath = self.outputFolder + nameToSave
        if not os.path.exists(filePath) or os.stat(filePath).st_size == 0:
            with open(filePath, "wb") as handle:  # binary mode: body is bytes
                handle.write(response.body)
        else:
            logger.info('skipping file %s' % filePath)

    def findAllJsons(self, response):
        # Note: this method (and the super-pharm URL below) looks like a
        # leftover from a sibling price-crawler spider; self.urlDate and
        # self.parsePage are assumed to be defined elsewhere.
        currentPage = int(response.meta.get('currentPage'))
        lastPageNum = int(response.meta.get('lastPage'))
        jsonListPage = self.parsePage(response)
        logger.info('parsing:%s json len:%s page num:%s/%s'
                    % (response.url, len(jsonListPage), currentPage, lastPageNum))
        if currentPage < lastPageNum:
            nextPage = currentPage + 1
            pageUrl = 'http://prices.super-pharm.co.il/?type=&page=%s&date=%s' % (nextPage, self.urlDate)
            request = scrapy.Request(pageUrl, callback=self.findAllJsons)
            request.meta['currentPage'] = nextPage
            request.meta['lastPage'] = lastPageNum
            return request
        else:
            logger.info('parsed last page, total jsons:%s' % len(self.jsonsList))
            # continue with the downloading loop
            jsonPath = self.jsonsList.pop()
            logger.info('starting to download:%s items left:%s' % (jsonPath, len(self.jsonsList)))
            return scrapy.Request(jsonPath, callback=self.parseFileJson)

    def parseFileJson(self, response):
        jBody = json.loads(response.body)
        if jBody['status'] == 0:
            # logger.debug('parseFileJson, urljoin:' + response.urljoin(jBody['href']))
            return scrapy.Request(response.urljoin(jBody['href']), callback=self.parseFilePath)
        # logger.debug('body:%s type:%s' % (response.body, type(response.body)))

    def parseFilePath(self, response):
        # logger.debug('parseFilePath, response[headers].__dict__:%s' % response.headers.__dict__)
        # decode the header before matching, as parseBranches does
        contentDisp = response.headers.get('Content-Disposition').decode('utf-8')
        nameToSave = re.findall('filename="(.*)"', contentDisp)[0]
        filePath = self.outputFolder + nameToSave
        if not os.path.exists(filePath) or os.stat(filePath).st_size == 0:
            with open(filePath, "wb") as handle:  # binary mode: body is bytes
                handle.write(response.body)
        else:
            logger.info('skipping file %s' % filePath)
        if self.jsonsList:
            jsonPath = self.jsonsList.pop()
            logger.info('starting to download:%s items left:%s' % (jsonPath, len(self.jsonsList)))
            return scrapy.Request(jsonPath, callback=self.parseFileJson)
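
# A minimal standalone sketch (not part of the original script) of the
# Content-Disposition handling shared by parseBranches and parseFilePath:
# the server names the download in the header and the spider saves the body
# under that name. The header value below is a made-up example; the pattern
# accepts both the bare and the quoted filename= forms seen above.
def _demoContentDispositionParsing():
    sampleHeader = 'attachment; filename="BranchesFull.xml"'  # hypothetical value
    nameToSave = re.findall('filename="?([^";]*)"?', sampleHeader)[0]
    return nameToSave  # -> 'BranchesFull.xml'
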
if __name__ == "__main__":
    try:
        logging.basicConfig(level=logging.INFO, stream=sys.stdout)
        logger.info('start')
        process = CrawlerProcess({
            'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
            'LOG_ENABLED': False,
        })
        logging.getLogger('scrapy').setLevel(logging.WARNING)
        logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(logging.WARNING)

        inDown = Namespace()
        now = datetime.datetime.now()
        # folder = '/home/foodmarket/dailysources/coop/python/' + datetime.datetime.strftime(now, '%Y%m') + '/' + datetime.datetime.strftime(now, '%Y%m%d') + '/'
        folder = './coop/python/' + datetime.datetime.strftime(now, '%Y%m') + '/' + datetime.datetime.strftime(now, '%Y%m%d') + '/'
        if not os.path.exists(folder):
            os.makedirs(folder)
        inDown.outputFolder = folder
        inDown.startUrl = 'http://coopisrael.coop/'
        outDown = Namespace()
        logger.info('start1')
        process.crawl(CoopSpider, inDown=inDown, outDown=outDown)  # your spider here
        # r = requests.post(url)
        process.start()
        # downloadOnly(url, folder)
        logger.info('Program Complete')
    except Exception as e:
        logging.exception("message")
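
# A hedged sketch (not in the original) of the download loop that
# findAllJsons/parseFileJson/parseFilePath implement by chaining Scrapy
# requests: pop a path, fetch it, save it, then schedule the next one.
# It uses the requests library imported above; urls and outputFolder are
# hypothetical arguments, and the filename pattern accepts both header forms.
def _demoSequentialDownload(urls, outputFolder):
    for url in urls:
        resp = requests.get(url)
        contentDisp = resp.headers.get('Content-Disposition', '')
        found = re.findall('filename="?([^";]*)"?', contentDisp)
        if not found:
            continue  # no attachment name in the header; nothing to save
        filePath = os.path.join(outputFolder, found[0])
        # same skip-if-already-downloaded check as parseBranches/parseFilePath
        if not os.path.exists(filePath) or os.stat(filePath).st_size == 0:
            with open(filePath, "wb") as handle:
                handle.write(resp.content)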