diff --git a/ImmoScrap/settings.py b/ImmoScrap/settings.py index 40b4024..94996c5 100644 --- a/ImmoScrap/settings.py +++ b/ImmoScrap/settings.py @@ -14,8 +14,8 @@ NEWSPIDER_MODULE = 'ImmoScrap.spiders' LOG_LEVEL = 'WARNING' FEEDS = { - 'export.json': { - 'format': 'json' + 'export.jsonl': { + 'format': 'jsonlines' } } @@ -32,7 +32,7 @@ ROBOTSTXT_OBEY = False # Configure a delay for requests for the same website (default: 0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs -DOWNLOAD_DELAY = 2 # Beetween 1 and 3 seconds : RANDOMIZE_DOWNLOAD_DELAY enabled by default +DOWNLOAD_DELAY = 5 # Between 2.5 and 7.5 seconds : RANDOMIZE_DOWNLOAD_DELAY enabled by default # The download delay setting will honor only one of: # CONCURRENT_REQUESTS_PER_DOMAIN = 16 # CONCURRENT_REQUESTS_PER_IP = 16 diff --git a/ImmoScrap/spiders/Seloger.py b/ImmoScrap/spiders/Seloger.py index 644d93a..95a241b 100644 --- a/ImmoScrap/spiders/Seloger.py +++ b/ImmoScrap/spiders/Seloger.py @@ -48,7 +48,7 @@ class SelogerSpider(scrapy.Spider): rooms = infos.xpath('./li[contains(., "p")]/text()').get() bedrooms = infos.xpath('./li[contains(., "ch")]/text()').get() if area is not None: - area = area.replace(' m²', '') + area = area.replace(' m²', '').replace(',', '.') if rooms is not None: rooms = rooms.replace(' p', '') if bedrooms is not None: @@ -60,7 +60,7 @@ class SelogerSpider(scrapy.Spider): 'price': int(ad.css(self.price_css_sel).get().replace('\xa0', '').replace(' €', '')), 'rooms': rooms, 'bedrooms': bedrooms, - 'area': area.replace(',', '.') + 'area': area } active_page = response.url active_page_nb = int(active_page.split('LISTING-LISTpg=')[1])