From 0655c01cc927cdb9d52547dc8e01fb6fd5322c3a Mon Sep 17 00:00:00 2001
From: Jordan ERNST
Date: Sun, 16 Aug 2020 01:13:20 +0200
Subject: [PATCH] Added User-Agent rotation

---
 ImmoScrap.py                 |  2 +-
 ImmoScrap/settings.py        | 63 +++++++++++++++++++++---------------
 ImmoScrap/spiders/Seloger.py |  7 ++--
 3 files changed, 41 insertions(+), 31 deletions(-)

diff --git a/ImmoScrap.py b/ImmoScrap.py
index 0acd73a..5669f99 100755
--- a/ImmoScrap.py
+++ b/ImmoScrap.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env ipython3
+#!/usr/bin/env python3
 
 from scrapy.crawler import CrawlerProcess
 from scrapy.utils.project import get_project_settings
diff --git a/ImmoScrap/settings.py b/ImmoScrap/settings.py
index d3861bd..3697430 100644
--- a/ImmoScrap/settings.py
+++ b/ImmoScrap/settings.py
@@ -14,45 +14,56 @@ NEWSPIDER_MODULE = 'ImmoScrap.spiders'
 
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'ImmoScrap (+http://www.yourdomain.com)'
+# USER_AGENT = 'ImmoScrap (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
+# CONCURRENT_REQUESTS = 32
 
 # Configure a delay for requests for the same website (default: 0)
 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+# DOWNLOAD_DELAY = 3
 # The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-#CONCURRENT_REQUESTS_PER_IP = 16
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
 
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+# COOKIES_ENABLED = False
 
 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+# TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
+# DEFAULT_REQUEST_HEADERS = {
 #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 #   'Accept-Language': 'en',
-#}
+# }
 
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
+# SPIDER_MIDDLEWARES = {
 #    'ImmoScrap.middlewares.ImmoscrapSpiderMiddleware': 543,
-#}
+# }
 
 # Enable or disable downloader middlewares
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
-#    'ImmoScrap.middlewares.ImmoscrapDownloaderMiddleware': 543,
-#}
+
+# Rotate User-Agents with the scrapy_fake_useragent middlewares:
+DOWNLOADER_MIDDLEWARES = {
+    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
+    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
+    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
+    'scrapy_fake_useragent.middleware.RetryUserAgentMiddleware': 401,
+}
+FAKEUSERAGENT_PROVIDERS = [
+    # 'scrapy_fake_useragent.providers.FakeUserAgentProvider',  # Depends on http://useragentstring.com, which is currently down
+    'scrapy_fake_useragent.providers.FakerProvider',  # With FakeUserAgentProvider disabled, faker generates the user-agent strings
+    'scrapy_fake_useragent.providers.FixedUserAgentProvider',  # Fall back to the USER_AGENT value below
+]
+USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0'  # Fallback value
 
 # Enable or disable extensions
 # See https://docs.scrapy.org/en/latest/topics/extensions.html
@@ -62,27 +73,27 @@ EXTENSIONS = {
 
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
+# ITEM_PIPELINES = {
 #    'ImmoScrap.pipelines.ImmoscrapPipeline': 300,
-#}
+# }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+# AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+# AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+# AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+# AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/ImmoScrap/spiders/Seloger.py b/ImmoScrap/spiders/Seloger.py
index df62330..8b57425 100644
--- a/ImmoScrap/spiders/Seloger.py
+++ b/ImmoScrap/spiders/Seloger.py
@@ -2,7 +2,8 @@ import scrapy
 from urllib.parse import urlencode
 # from math import ceil
 
-# The only requrement to bypass protections on seloger is to spoof User-Agent
+# WRONG: The only requirement to bypass protections on seloger is to spoof the User-Agent
+# Let's rotate User-Agents instead
 
 
 class SelogerSpider(scrapy.Spider):
@@ -26,8 +27,6 @@ class SelogerSpider(scrapy.Spider):
     stetienne_params['searchareas'] = stetienne_area
     stetienne_params['LISTING-LISTpg'] = 1 # From python 3.7 dict keep insertion order. We wand the page to be the last param
 
-    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
-
     ads_css_sel = '.ListContent__SmartClassifiedExtended-sc-1viyr2k-2'
     url_css_sel = '.dXJclF::attr(href)'
     type_css_sel = '.joPkKZ::text'
@@ -78,4 +77,4 @@ class SelogerSpider(scrapy.Spider):
         if current_ads_nb != total_ads_nb: # If not last page
             next_page_nb = active_page_nb + 1
             next_page = active_page.split('LISTING-LISTpg=')[0] + f'LISTING-LISTpg={next_page_nb}'
-            yield response.follow(next_page, headers=self.headers, callback=self.parse)
+            yield response.follow(next_page, callback=self.parse)
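
Note: below is a minimal sketch (not part of this patch) of how the rotation could be checked, assuming scrapy-fake-useragent and the faker package it relies on are installed (pip install scrapy-fake-useragent faker). The spider, its file name, and the httpbin.org target are hypothetical placeholders, not part of ImmoScrap; the spider only logs the User-Agent header that the middleware attached to each outgoing request.

# ua_check.py -- hypothetical helper; run from the project root so settings.py
# (and the DOWNLOADER_MIDDLEWARES above) apply:
#   scrapy runspider ua_check.py
import scrapy


class UACheckSpider(scrapy.Spider):
    name = 'ua_check'

    def start_requests(self):
        # dont_filter=True so the duplicate-URL filter does not collapse the requests
        for _ in range(5):
            yield scrapy.Request('https://httpbin.org/headers', dont_filter=True)

    def parse(self, response):
        # RandomUserAgentMiddleware sets the header on the request before it is sent;
        # if rotation works, the logged value should vary between requests.
        ua = response.request.headers.get('User-Agent')
        self.logger.info('User-Agent sent: %s', ua.decode() if ua else None)

With the providers configured above, each value comes from FakerProvider; if that ever fails, FixedUserAgentProvider falls back to the fixed USER_AGENT string from settings.py.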