Unify yield values + leboncoin: Fixes
This commit is contained in:
parent
7e58f445a5
commit
814aea4e66
|
@ -1 +1,2 @@
|
|||
__pycache__
|
||||
*.csv
|
||||
|
|
|
@ -30,12 +30,19 @@ class PAPSpider(scrapy.Spider):
|
|||
if url[0] != '/': # If url starts with / : on website, else: advertisement
|
||||
break
|
||||
|
||||
rooms = infos.xpath('./li[contains(., "pièce")]/text()').get()
|
||||
bedrooms = infos.xpath('./li[contains(., "chambre")]/text()').get()
|
||||
if rooms is not None:
|
||||
rooms = rooms.replace(' pièce', '').replace('s', '') # s, left if plural
|
||||
if bedrooms is not None:
|
||||
bedrooms = bedrooms.replace(' chambre', '').replace('s', '') # s, left if plural
|
||||
|
||||
yield {
|
||||
'url': self.baseurl + url,
|
||||
'type': 'Unknown',
|
||||
'price': ad.css(self.price_css_sel).get(),
|
||||
'rooms': infos.xpath('./li[contains(., "pièces")]/text()').get(),
|
||||
'bedrooms': infos.xpath('./li[contains(., "chambres")]/text()').get(),
|
||||
'price': ad.css(self.price_css_sel).get().replace('.', '').replace('\xa0€', ''),
|
||||
'rooms': rooms,
|
||||
'bedrooms': bedrooms,
|
||||
'area': infos.xpath('./li[contains(small, "m")]/text()').get()
|
||||
}
|
||||
|
||||
|
|
|
@ -48,13 +48,23 @@ class SelogerSpider(scrapy.Spider):
|
|||
for ad in ads:
|
||||
infos = ad.css(self.infos_css_sel)
|
||||
|
||||
area = infos.xpath('./li[contains(., "m²")]/text()').get()
|
||||
rooms = infos.xpath('./li[contains(., "p")]/text()').get()
|
||||
bedrooms = infos.xpath('./li[contains(., "ch")]/text()').get()
|
||||
if area is not None:
|
||||
area = area.replace(' m²', '')
|
||||
if rooms is not None:
|
||||
rooms = rooms.replace(' p', '')
|
||||
if bedrooms is not None:
|
||||
bedrooms = bedrooms.replace(' ch', '')
|
||||
|
||||
yield {
|
||||
'url': ad.css(self.url_css_sel).get().split('?')[0],
|
||||
'type': ad.css(self.type_css_sel).get(),
|
||||
'price': ad.css(self.price_css_sel).get().replace('\xa0', ' '),
|
||||
'rooms': infos.xpath('./li[contains(., "p")]/text()').get(),
|
||||
'bedrooms': infos.xpath('./li[contains(., "ch")]/text()').get(),
|
||||
'area': infos.xpath('./li[contains(., "m²")]/text()').get()
|
||||
'type': ad.css(self.type_css_sel).get().replace('/Villa', ''),
|
||||
'price': ad.css(self.price_css_sel).get().replace('\xa0', '').replace(' €', ''),
|
||||
'rooms': rooms,
|
||||
'bedrooms': bedrooms,
|
||||
'area': area
|
||||
}
|
||||
"""
|
||||
pricefloat = float(price.replace(',', '.').replace(' ', '').replace('€', ''))
|
||||
|
|
|
@ -1,6 +1,29 @@
|
|||
import scrapy
|
||||
|
||||
|
||||
def get_attributes(attributes):
    """Extract the real-estate type, room count and area from a
    leboncoin ad's attribute list.

    Each attribute is a dict carrying a 'key' plus a 'value'
    (and 'value_label' for the human-readable type label).

    Args:
        attributes: iterable of attribute dicts from the ad JSON.

    Returns:
        A (type, rooms, area) tuple. Any element whose key is absent
        from `attributes` is None.
    """
    # Initialize all results up front instead of the original
    # try/except NameError probing — same observable behavior
    # (None for missing keys), far clearer intent.
    # 'estate_type' also avoids shadowing the builtin `type`.
    estate_type = rooms = area = None
    for attribute in attributes:
        if attribute['key'] == 'real_estate_type':
            estate_type = attribute['value_label']
        elif attribute['key'] == 'rooms':
            rooms = attribute['value']
        elif attribute['key'] == 'square':
            area = attribute['value']
    return estate_type, rooms, area
|
||||
|
||||
|
||||
class LeboncoinSpider(scrapy.Spider):
|
||||
name = "leboncoin"
|
||||
|
||||
|
@ -58,13 +81,16 @@ class LeboncoinSpider(scrapy.Spider):
|
|||
json = response.json()
|
||||
ads = json["ads"]
|
||||
for ad in ads:
|
||||
attributes = ad['attributes']
|
||||
type, rooms, area = get_attributes(attributes)
|
||||
|
||||
yield {
|
||||
'url': ad['url'],
|
||||
'type': ad['attributes'][0]['value_label'],
|
||||
'price': ad['price'],
|
||||
'rooms': ad['attributes'][2]['value'],
|
||||
'type': type,
|
||||
'price': ad['price'][0],
|
||||
'rooms': rooms,
|
||||
'bedrooms': 'Unknown',
|
||||
'area': ad['attributes'][1]['value']
|
||||
'area': area
|
||||
}
|
||||
|
||||
total_ads_nb = json["total"]
|
||||
|
|
Loading…
Reference in New Issue