From 0655c01cc927cdb9d52547dc8e01fb6fd5322c3a Mon Sep 17 00:00:00 2001
From: Jordan ERNST
Date: Sun, 16 Aug 2020 01:13:20 +0200
Subject: [PATCH] Added User-Agent rotation

---
 ImmoScrap.py                 |  2 +-
 ImmoScrap/settings.py        | 63 +++++++++++++++++++++---------------
 ImmoScrap/spiders/Seloger.py |  7 ++--
 3 files changed, 41 insertions(+), 31 deletions(-)

diff --git a/ImmoScrap.py b/ImmoScrap.py
index 0acd73a..5669f99 100755
--- a/ImmoScrap.py
+++ b/ImmoScrap.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env ipython3
+#!/usr/bin/env python3
 
 from scrapy.crawler import CrawlerProcess
 from scrapy.utils.project import get_project_settings
diff --git a/ImmoScrap/settings.py b/ImmoScrap/settings.py
index d3861bd..3697430 100644
--- a/ImmoScrap/settings.py
+++ b/ImmoScrap/settings.py
@@ -14,45 +14,56 @@ NEWSPIDER_MODULE = 'ImmoScrap.spiders'
 
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'ImmoScrap (+http://www.yourdomain.com)'
+# USER_AGENT = 'ImmoScrap (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
+# CONCURRENT_REQUESTS = 32
 
 # Configure a delay for requests for the same website (default: 0)
 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+# DOWNLOAD_DELAY = 3
 # The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-#CONCURRENT_REQUESTS_PER_IP = 16
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
 
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+# COOKIES_ENABLED = False
 
 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+# TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
+# DEFAULT_REQUEST_HEADERS = {
 #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 #   'Accept-Language': 'en',
-#}
+# }
 
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
+# SPIDER_MIDDLEWARES = {
 #    'ImmoScrap.middlewares.ImmoscrapSpiderMiddleware': 543,
-#}
+# }
 
 # Enable or disable downloader middlewares
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
-#    'ImmoScrap.middlewares.ImmoscrapDownloaderMiddleware': 543,
-#}
+
+# Rotate User-Agents with the scrapy_fake_useragent middlewares:
+DOWNLOADER_MIDDLEWARES = {
+    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
+    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
+    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
+    'scrapy_fake_useragent.middleware.RetryUserAgentMiddleware': 401,
+}
+FAKEUSERAGENT_PROVIDERS = [
+    # 'scrapy_fake_useragent.providers.FakeUserAgentProvider',  # Depends on http://useragentstring.com, which is currently down
+    'scrapy_fake_useragent.providers.FakerProvider',  # With FakeUserAgentProvider disabled, faker generates the user-agent strings
+    'scrapy_fake_useragent.providers.FixedUserAgentProvider',  # Fall back to the USER_AGENT value below
+]
+USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0'  # Fallback value
 
 # Enable or disable extensions
 # See https://docs.scrapy.org/en/latest/topics/extensions.html
@@ -62,27 +73,27 @@ EXTENSIONS = {
 
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
+# ITEM_PIPELINES = {
 #    'ImmoScrap.pipelines.ImmoscrapPipeline': 300,
-#}
+# }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+# AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+# AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+# AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+# AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/ImmoScrap/spiders/Seloger.py b/ImmoScrap/spiders/Seloger.py
index df62330..8b57425 100644
--- a/ImmoScrap/spiders/Seloger.py
+++ b/ImmoScrap/spiders/Seloger.py
@@ -2,7 +2,8 @@ import scrapy
 from urllib.parse import urlencode
 # from math import ceil
 
-# The only requrement to bypass protections on seloger is to spoof User-Agent
+# WRONG: The only requirement to bypass protections on seloger is to spoof the User-Agent
+# Let's rotate User-Agents instead
 
 
 class SelogerSpider(scrapy.Spider):
@@ -26,8 +27,6 @@ class SelogerSpider(scrapy.Spider):
     stetienne_params['searchareas'] = stetienne_area
     stetienne_params['LISTING-LISTpg'] = 1 # From python 3.7 dict keep insertion order. We wand the page to be the last param
 
-    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
-
     ads_css_sel = '.ListContent__SmartClassifiedExtended-sc-1viyr2k-2'
     url_css_sel = '.dXJclF::attr(href)'
     type_css_sel = '.joPkKZ::text'
@@ -78,4 +77,4 @@ class SelogerSpider(scrapy.Spider):
         if current_ads_nb != total_ads_nb: # If not last page
             next_page_nb = active_page_nb + 1
             next_page = active_page.split('LISTING-LISTpg=')[0] + f'LISTING-LISTpg={next_page_nb}'
-            yield response.follow(next_page, headers=self.headers, callback=self.parse)
+            yield response.follow(next_page, callback=self.parse)
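
Note: below is a minimal sketch (not part of this patch) of how the rotation could be checked, assuming scrapy-fake-useragent and the faker package it relies on are installed (pip install scrapy-fake-useragent faker). The spider, its file name, and the httpbin.org target are hypothetical placeholders, not part of ImmoScrap; the spider only logs the User-Agent header that the middleware attached to each outgoing request.

# ua_check.py -- hypothetical helper; run from the project root so settings.py
# (and the DOWNLOADER_MIDDLEWARES above) apply:
#   scrapy runspider ua_check.py
import scrapy


class UACheckSpider(scrapy.Spider):
    name = 'ua_check'

    def start_requests(self):
        # dont_filter=True so the duplicate-URL filter does not collapse the requests
        for _ in range(5):
            yield scrapy.Request('https://httpbin.org/headers', dont_filter=True)

    def parse(self, response):
        # RandomUserAgentMiddleware sets the header on the request before it is sent;
        # if rotation works, the logged value should vary between requests.
        ua = response.request.headers.get('User-Agent')
        self.logger.info('User-Agent sent: %s', ua.decode() if ua else None)

With the providers configured above, each value comes from FakerProvider; if that ever fails, FixedUserAgentProvider falls back to the fixed USER_AGENT string from settings.py.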