Added User-Agent rotation

Jordan ERNST 2020-08-16 01:13:20 +02:00
parent 3771e4d507
commit 0655c01cc9
3 changed files with 41 additions and 31 deletions

@@ -1,4 +1,4 @@
#!/usr/bin/env ipython3
#!/usr/bin/env python3
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
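(For context, this file is the stand-alone runner. Below is a minimal sketch of how these two imports are typically wired together, not taken from the repository; the spider name 'seloger' is an assumption based on the SelogerSpider class further down.)

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # load the ImmoScrap project settings
process.crawl('seloger')  # spider name assumed; resolved via the project's spider loader
process.start()           # start the reactor and block until the crawl finishes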

@@ -14,45 +14,56 @@ NEWSPIDER_MODULE = 'ImmoScrap.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ImmoScrap (+http://www.yourdomain.com)'
# USER_AGENT = 'ImmoScrap (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# SPIDER_MIDDLEWARES = {
# 'ImmoScrap.middlewares.ImmoscrapSpiderMiddleware': 543,
#}
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'ImmoScrap.middlewares.ImmoscrapDownloaderMiddleware': 543,
#}
# Middlewares from scrapy-fake-useragent to rotate User-Agents:
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
'scrapy_fake_useragent.middleware.RetryUserAgentMiddleware': 401,
}
FAKEUSERAGENT_PROVIDERS = [
# 'scrapy_fake_useragent.providers.FakeUserAgentProvider', # Depends on http://useragentstring.com which is currently down
'scrapy_fake_useragent.providers.FakerProvider', # if FakeUserAgentProvider fails, we'll use faker to generate a user-agent string for us
'scrapy_fake_useragent.providers.FixedUserAgentProvider', # fall back to USER_AGENT value
]
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0' # Fallback value
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
@@ -62,27 +73,27 @@ EXTENSIONS = {
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# ITEM_PIPELINES = {
# 'ImmoScrap.pipelines.ImmoscrapPipeline': 300,
#}
# }
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
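Not part of this commit, but a minimal way to check that the rotation configured above actually takes effect: a throwaway spider that logs the User-Agent header Scrapy attached to each outgoing request. The spider name and the httpbin.org URL are placeholder assumptions.

import scrapy

class UACheckSpider(scrapy.Spider):
    name = 'ua_check'  # placeholder name
    start_urls = ['https://httpbin.org/headers']  # endpoint that echoes request headers back

    def parse(self, response):
        # Header set by RandomUserAgentMiddleware (or the fallback USER_AGENT above)
        sent_ua = response.request.headers.get('User-Agent', b'').decode()
        self.logger.info('User-Agent sent: %s', sent_ua)
        yield {'user_agent': sent_ua}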

@@ -2,7 +2,8 @@ import scrapy
from urllib.parse import urlencode
# from math import ceil
# The only requirement to bypass protections on seloger is to spoof User-Agent
# WRONG: The only requirement to bypass protections on seloger is to spoof User-Agent
# Let's rotate User-Agents
class SelogerSpider(scrapy.Spider):
@@ -26,8 +27,6 @@ class SelogerSpider(scrapy.Spider):
stetienne_params['searchareas'] = stetienne_area
stetienne_params['LISTING-LISTpg'] = 1 # From Python 3.7, dicts keep insertion order. We want the page to be the last param
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
ads_css_sel = '.ListContent__SmartClassifiedExtended-sc-1viyr2k-2'
url_css_sel = '.dXJclF::attr(href)'
type_css_sel = '.joPkKZ::text'
@@ -78,4 +77,4 @@ class SelogerSpider(scrapy.Spider):
if current_ads_nb != total_ads_nb: # If not last page
next_page_nb = active_page_nb + 1
next_page = active_page.split('LISTING-LISTpg=')[0] + f'LISTING-LISTpg={next_page_nb}'
yield response.follow(next_page, headers=self.headers, callback=self.parse)
yield response.follow(next_page, callback=self.parse)
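For reference, the next-page rewrite kept in parse above splits the current URL on the LISTING-LISTpg parameter (which the spider deliberately keeps as the last query parameter) and re-appends it with the incremented page number. A small worked example with a made-up URL:

active_page = 'https://www.seloger.com/list.htm?projects=2&types=1&LISTING-LISTpg=3'  # hypothetical URL
active_page_nb = 3

next_page_nb = active_page_nb + 1
# Keep everything before 'LISTING-LISTpg=' and append the incremented page number
next_page = active_page.split('LISTING-LISTpg=')[0] + f'LISTING-LISTpg={next_page_nb}'
print(next_page)  # https://www.seloger.com/list.htm?projects=2&types=1&LISTING-LISTpg=4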