Unify yield values + leboncoin: Fixes

This commit is contained in:
Jordan ERNST 2020-08-03 23:31:38 +02:00
parent 7e58f445a5
commit 814aea4e66
4 changed files with 56 additions and 12 deletions

1
.gitignore vendored
View File

@ -1 +1,2 @@
__pycache__
*.csv

View File

@ -30,12 +30,19 @@ class PAPSpider(scrapy.Spider):
if url[0] != '/': # If url starts with / : on website, else: advertisement
break
rooms = infos.xpath('./li[contains(., "pièce")]/text()').get()
bedrooms = infos.xpath('./li[contains(., "chambre")]/text()').get()
if rooms is not None:
rooms = rooms.replace(' pièce', '').replace('s', '') # s, left if plural
if bedrooms is not None:
bedrooms = bedrooms.replace(' chambre', '').replace('s', '') # s, left if plural
yield {
'url': self.baseurl + url,
'type': 'Unknown',
'price': ad.css(self.price_css_sel).get(),
'rooms': infos.xpath('./li[contains(., "pièces")]/text()').get(),
'bedrooms': infos.xpath('./li[contains(., "chambres")]/text()').get(),
'price': ad.css(self.price_css_sel).get().replace('.', '').replace('\xa0', ''),
'rooms': rooms,
'bedrooms': bedrooms,
'area': infos.xpath('./li[contains(small, "m")]/text()').get()
}

View File

@ -48,13 +48,23 @@ class SelogerSpider(scrapy.Spider):
for ad in ads:
infos = ad.css(self.infos_css_sel)
area = infos.xpath('./li[contains(., "")]/text()').get()
rooms = infos.xpath('./li[contains(., "p")]/text()').get()
bedrooms = infos.xpath('./li[contains(., "ch")]/text()').get()
if area is not None:
area = area.replace('', '')
if rooms is not None:
rooms = rooms.replace(' p', '')
if bedrooms is not None:
bedrooms = bedrooms.replace(' ch', '')
yield {
'url': ad.css(self.url_css_sel).get().split('?')[0],
'type': ad.css(self.type_css_sel).get(),
'price': ad.css(self.price_css_sel).get().replace('\xa0', ' '),
'rooms': infos.xpath('./li[contains(., "p")]/text()').get(),
'bedrooms': infos.xpath('./li[contains(., "ch")]/text()').get(),
'area': infos.xpath('./li[contains(., "")]/text()').get()
'type': ad.css(self.type_css_sel).get().replace('/Villa', ''),
'price': ad.css(self.price_css_sel).get().replace('\xa0', '').replace(' ', ''),
'rooms': rooms,
'bedrooms': bedrooms,
'area': area
}
"""
pricefloat = float(price.replace(',', '.').replace(' ', '').replace('', ''))

View File

@ -1,6 +1,29 @@
import scrapy
def get_attributes(attributes):
    """Pick the property type, room count and area out of an ad's attributes.

    Args:
        attributes: Iterable of mappings, each with a 'key' entry. Entries
            keyed 'real_estate_type' are read through 'value_label'; entries
            keyed 'rooms' or 'square' are read through 'value'. Entries with
            any other key are ignored.

    Returns:
        A ``(type, rooms, area)`` tuple. Any element whose key never appears
        in ``attributes`` is ``None``. If a key appears more than once, the
        last occurrence wins.

    Raises:
        KeyError: If an entry has no 'key', or a matching entry lacks the
            field it is read through.
    """
    # Start from None so a missing attribute needs no special handling
    # (previously detected by catching NameError on an unbound local).
    # `property_type` avoids shadowing the builtin `type`.
    property_type = rooms = area = None
    for attribute in attributes:
        key = attribute['key']
        if key == 'real_estate_type':
            property_type = attribute['value_label']
        elif key == 'rooms':
            rooms = attribute['value']
        elif key == 'square':
            area = attribute['value']
    return property_type, rooms, area
class LeboncoinSpider(scrapy.Spider):
name = "leboncoin"
@ -58,13 +81,16 @@ class LeboncoinSpider(scrapy.Spider):
json = response.json()
ads = json["ads"]
for ad in ads:
attributes = ad['attributes']
type, rooms, area = get_attributes(attributes)
yield {
'url': ad['url'],
'type': ad['attributes'][0]['value_label'],
'price': ad['price'],
'rooms': ad['attributes'][2]['value'],
'type': type,
'price': ad['price'][0],
'rooms': rooms,
'bedrooms': 'Unknown',
'area': ad['attributes'][1]['value']
'area': area
}
total_ads_nb = json["total"]