Unify yield values + leboncoin: Fixes
This commit is contained in:
parent
7e58f445a5
commit
814aea4e66
|
@ -1 +1,2 @@
|
||||||
__pycache__
|
__pycache__
|
||||||
|
*.csv
|
||||||
|
|
|
@ -30,12 +30,19 @@ class PAPSpider(scrapy.Spider):
|
||||||
if url[0] != '/': # If url starts with / : on website, else: advertisement
|
if url[0] != '/': # If url starts with / : on website, else: advertisement
|
||||||
break
|
break
|
||||||
|
|
||||||
|
rooms = infos.xpath('./li[contains(., "pièce")]/text()').get()
|
||||||
|
bedrooms = infos.xpath('./li[contains(., "chambre")]/text()').get()
|
||||||
|
if rooms is not None:
|
||||||
|
rooms = rooms.replace(' pièce', '').replace('s', '') # s, left if plural
|
||||||
|
if bedrooms is not None:
|
||||||
|
bedrooms = bedrooms.replace(' chambre', '').replace('s', '') # s, left if plural
|
||||||
|
|
||||||
yield {
|
yield {
|
||||||
'url': self.baseurl + url,
|
'url': self.baseurl + url,
|
||||||
'type': 'Unknown',
|
'type': 'Unknown',
|
||||||
'price': ad.css(self.price_css_sel).get(),
|
'price': ad.css(self.price_css_sel).get().replace('.', '').replace('\xa0€', ''),
|
||||||
'rooms': infos.xpath('./li[contains(., "pièces")]/text()').get(),
|
'rooms': rooms,
|
||||||
'bedrooms': infos.xpath('./li[contains(., "chambres")]/text()').get(),
|
'bedrooms': bedrooms,
|
||||||
'area': infos.xpath('./li[contains(small, "m")]/text()').get()
|
'area': infos.xpath('./li[contains(small, "m")]/text()').get()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -48,13 +48,23 @@ class SelogerSpider(scrapy.Spider):
|
||||||
for ad in ads:
|
for ad in ads:
|
||||||
infos = ad.css(self.infos_css_sel)
|
infos = ad.css(self.infos_css_sel)
|
||||||
|
|
||||||
|
area = infos.xpath('./li[contains(., "m²")]/text()').get()
|
||||||
|
rooms = infos.xpath('./li[contains(., "p")]/text()').get()
|
||||||
|
bedrooms = infos.xpath('./li[contains(., "ch")]/text()').get()
|
||||||
|
if area is not None:
|
||||||
|
area = area.replace(' m²', '')
|
||||||
|
if rooms is not None:
|
||||||
|
rooms = rooms.replace(' p', '')
|
||||||
|
if bedrooms is not None:
|
||||||
|
bedrooms = bedrooms.replace(' ch', '')
|
||||||
|
|
||||||
yield {
|
yield {
|
||||||
'url': ad.css(self.url_css_sel).get().split('?')[0],
|
'url': ad.css(self.url_css_sel).get().split('?')[0],
|
||||||
'type': ad.css(self.type_css_sel).get(),
|
'type': ad.css(self.type_css_sel).get().replace('/Villa', ''),
|
||||||
'price': ad.css(self.price_css_sel).get().replace('\xa0', ' '),
|
'price': ad.css(self.price_css_sel).get().replace('\xa0', '').replace(' €', ''),
|
||||||
'rooms': infos.xpath('./li[contains(., "p")]/text()').get(),
|
'rooms': rooms,
|
||||||
'bedrooms': infos.xpath('./li[contains(., "ch")]/text()').get(),
|
'bedrooms': bedrooms,
|
||||||
'area': infos.xpath('./li[contains(., "m²")]/text()').get()
|
'area': area
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
pricefloat = float(price.replace(',', '.').replace(' ', '').replace('€', ''))
|
pricefloat = float(price.replace(',', '.').replace(' ', '').replace('€', ''))
|
||||||
|
|
|
@ -1,6 +1,29 @@
|
||||||
import scrapy
|
import scrapy
|
||||||
|
|
||||||
|
|
||||||
|
def get_attributes(attributes):
    """Pick the property type, room count and area out of an ad's attributes.

    Args:
        attributes: Iterable of mappings. Every entry must have a ``'key'``
            item; entries whose key is ``'real_estate_type'`` must also have
            ``'value_label'``, and entries whose key is ``'rooms'`` or
            ``'square'`` must also have ``'value'``.

    Returns:
        A ``(type, rooms, area)`` tuple. Each element is ``None`` when no
        entry with the corresponding key is present. If a key occurs more
        than once, the last occurrence wins.

    Raises:
        KeyError: If an entry lacks ``'key'`` or the value field read for a
            recognised key.
    """
    # Start from explicit defaults so a missing attribute simply stays None,
    # rather than probing for unbound local names after the loop.
    property_type = None  # named to avoid shadowing the builtin ``type``
    rooms = None
    area = None

    for attribute in attributes:
        key = attribute['key']
        if key == 'real_estate_type':
            # The type is read from the label field, not the raw value.
            property_type = attribute['value_label']
        elif key == 'rooms':
            rooms = attribute['value']
        elif key == 'square':
            area = attribute['value']

    return property_type, rooms, area
|
||||||
|
|
||||||
|
|
||||||
class LeboncoinSpider(scrapy.Spider):
|
class LeboncoinSpider(scrapy.Spider):
|
||||||
name = "leboncoin"
|
name = "leboncoin"
|
||||||
|
|
||||||
|
@ -58,13 +81,16 @@ class LeboncoinSpider(scrapy.Spider):
|
||||||
json = response.json()
|
json = response.json()
|
||||||
ads = json["ads"]
|
ads = json["ads"]
|
||||||
for ad in ads:
|
for ad in ads:
|
||||||
|
attributes = ad['attributes']
|
||||||
|
type, rooms, area = get_attributes(attributes)
|
||||||
|
|
||||||
yield {
|
yield {
|
||||||
'url': ad['url'],
|
'url': ad['url'],
|
||||||
'type': ad['attributes'][0]['value_label'],
|
'type': type,
|
||||||
'price': ad['price'],
|
'price': ad['price'][0],
|
||||||
'rooms': ad['attributes'][2]['value'],
|
'rooms': rooms,
|
||||||
'bedrooms': 'Unknown',
|
'bedrooms': 'Unknown',
|
||||||
'area': ad['attributes'][1]['value']
|
'area': area
|
||||||
}
|
}
|
||||||
|
|
||||||
total_ads_nb = json["total"]
|
total_ads_nb = json["total"]
|
||||||
|
|
Loading…
Reference in New Issue