Last changes
This commit is contained in:
parent
ec7acf9308
commit
6ce018418a
|
@ -14,8 +14,8 @@ NEWSPIDER_MODULE = 'ImmoScrap.spiders'
|
||||||
LOG_LEVEL = 'WARNING'
|
LOG_LEVEL = 'WARNING'
|
||||||
|
|
||||||
FEEDS = {
|
FEEDS = {
|
||||||
'export.json': {
|
'export.jsonl': {
|
||||||
'format': 'json'
|
'format': 'jsonlines'
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -32,7 +32,7 @@ ROBOTSTXT_OBEY = False
|
||||||
# Configure a delay for requests for the same website (default: 0)
|
# Configure a delay for requests for the same website (default: 0)
|
||||||
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
|
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
|
||||||
# See also autothrottle settings and docs
|
# See also autothrottle settings and docs
|
||||||
DOWNLOAD_DELAY = 2 # Between 1 and 3 seconds: RANDOMIZE_DOWNLOAD_DELAY is enabled by default
|
DOWNLOAD_DELAY = 5 # Between 2.5 and 7.5 seconds: RANDOMIZE_DOWNLOAD_DELAY is enabled by default
|
||||||
# The download delay setting will honor only one of:
|
# The download delay setting will honor only one of:
|
||||||
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||||
# CONCURRENT_REQUESTS_PER_IP = 16
|
# CONCURRENT_REQUESTS_PER_IP = 16
|
||||||
|
|
|
@ -48,7 +48,7 @@ class SelogerSpider(scrapy.Spider):
|
||||||
rooms = infos.xpath('./li[contains(., "p")]/text()').get()
|
rooms = infos.xpath('./li[contains(., "p")]/text()').get()
|
||||||
bedrooms = infos.xpath('./li[contains(., "ch")]/text()').get()
|
bedrooms = infos.xpath('./li[contains(., "ch")]/text()').get()
|
||||||
if area is not None:
|
if area is not None:
|
||||||
area = area.replace(' m²', '')
|
area = area.replace(' m²', '').replace(',', '.')
|
||||||
if rooms is not None:
|
if rooms is not None:
|
||||||
rooms = rooms.replace(' p', '')
|
rooms = rooms.replace(' p', '')
|
||||||
if bedrooms is not None:
|
if bedrooms is not None:
|
||||||
|
@ -60,7 +60,7 @@ class SelogerSpider(scrapy.Spider):
|
||||||
'price': int(ad.css(self.price_css_sel).get().replace('\xa0', '').replace(' €', '')),
|
'price': int(ad.css(self.price_css_sel).get().replace('\xa0', '').replace(' €', '')),
|
||||||
'rooms': rooms,
|
'rooms': rooms,
|
||||||
'bedrooms': bedrooms,
|
'bedrooms': bedrooms,
|
||||||
'area': area.replace(',', '.')
|
'area': area
|
||||||
}
|
}
|
||||||
active_page = response.url
|
active_page = response.url
|
||||||
active_page_nb = int(active_page.split('LISTING-LISTpg=')[1])
|
active_page_nb = int(active_page.split('LISTING-LISTpg=')[1])
|
||||||
|
|
Loading…
Reference in New Issue