1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
| from selenium import webdriver from selenium.common.exceptions import TimeoutException from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from scrapy.http import HtmlResponse from logging import getLogger import time
class SeleniumMiddleware():
    """Scrapy downloader middleware that renders selected pages with PhantomJS.

    Requests whose URL contains one of the fragments listed in
    ``RENDER_TARGETS`` are loaded in a PhantomJS browser; every other request
    is left untouched (``process_request`` returns ``None`` for it).
    """

    # (URL fragment, CSS selector that must be present before the page is
    # considered rendered). Checked in order; the first matching fragment wins.
    RENDER_TARGETS = (
        ('image.baidu.com/search/flip', '.imglink'),
        ('image.baidu.com/search/detail', '#currentImg'),
    )

    # Seconds to sleep after navigation before waiting for the selector.
    RENDER_DELAY = 3

    def __init__(self, timeout=None, service_args=None):
        """Start the PhantomJS browser.

        :param timeout: value passed to both ``set_page_load_timeout`` and
            ``WebDriverWait`` (seconds).
        :param service_args: list of extra command-line arguments for the
            PhantomJS service; ``None`` means no extra arguments.
        """
        # A fresh list per instance: a ``[]`` default would be shared by
        # every instance created without this argument.
        if service_args is None:
            service_args = []
        self.logger = getLogger(__name__)
        self.timeout = timeout
        self.browser = webdriver.PhantomJS(service_args=service_args)
        self.browser.set_window_size(1400, 700)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def __del__(self):
        """Shut the browser down when the middleware is garbage-collected."""
        # ``browser`` is missing when __init__ failed before assigning it.
        browser = getattr(self, 'browser', None)
        if browser is not None:
            # quit() ends the whole driver session; close() would only close
            # the current window and leave the PhantomJS process running.
            browser.quit()

    def process_request(self, request, spider):
        """Fetch the page with PhantomJS when the URL is a render target.

        :param request: Request object
        :param spider: Spider object
        :return: HtmlResponse for matching URLs (status 200 with the rendered
            page source, or status 500 on timeout); ``None`` otherwise
        """
        for fragment, selector in self.RENDER_TARGETS:
            if fragment in request.url:
                self.logger.debug('%s PhantomJS is Starting', fragment)
                return self._render(request, selector)
        return None

    def _render(self, request, selector):
        """Load ``request.url`` and wait until ``selector`` is in the DOM."""
        try:
            self.browser.get(request.url)
            time.sleep(self.RENDER_DELAY)
            self.wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)
        return HtmlResponse(url=request.url, body=self.browser.page_source,
                            request=request, encoding='utf-8', status=200)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler settings.

        Reads ``SELENIUM_TIMEOUT`` and ``PHANTOMJS_SERVICE_ARGS``.
        """
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'),
                   service_args=crawler.settings.get('PHANTOMJS_SERVICE_ARGS'))
|