From 814aea4e669adfcc0cd30762a3c009467793ac91 Mon Sep 17 00:00:00 2001 From: Jordan ERNST Date: Mon, 3 Aug 2020 23:31:38 +0200 Subject: [PATCH] Unify yield values + leboncoin: Fixes --- .gitignore | 1 + ImmoScrap/spiders/PAP.py | 13 ++++++++++--- ImmoScrap/spiders/Seloger.py | 20 +++++++++++++++----- ImmoScrap/spiders/leboncoin.py | 34 ++++++++++++++++++++++++++++++---- 4 files changed, 56 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index bee8a64..644fb08 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ __pycache__ +*.csv diff --git a/ImmoScrap/spiders/PAP.py b/ImmoScrap/spiders/PAP.py index c06f212..f0fadbd 100644 --- a/ImmoScrap/spiders/PAP.py +++ b/ImmoScrap/spiders/PAP.py @@ -30,12 +30,19 @@ class PAPSpider(scrapy.Spider): if url[0] != '/': # If url starts with / : on website, else: advertisement break + rooms = infos.xpath('./li[contains(., "pièce")]/text()').get() + bedrooms = infos.xpath('./li[contains(., "chambre")]/text()').get() + if rooms is not None: + rooms = rooms.replace(' pièce', '').replace('s', '') # s, left if plural + if bedrooms is not None: + bedrooms = bedrooms.replace(' chambre', '').replace('s', '') # s, left if plural + yield { 'url': self.baseurl + url, 'type': 'Unknown', - 'price': ad.css(self.price_css_sel).get(), - 'rooms': infos.xpath('./li[contains(., "pièces")]/text()').get(), - 'bedrooms': infos.xpath('./li[contains(., "chambres")]/text()').get(), + 'price': ad.css(self.price_css_sel).get().replace('.', '').replace('\xa0€', ''), + 'rooms': rooms, + 'bedrooms': bedrooms, 'area': infos.xpath('./li[contains(small, "m")]/text()').get() } diff --git a/ImmoScrap/spiders/Seloger.py b/ImmoScrap/spiders/Seloger.py index 7e7d9f7..0b2199b 100644 --- a/ImmoScrap/spiders/Seloger.py +++ b/ImmoScrap/spiders/Seloger.py @@ -48,13 +48,23 @@ class SelogerSpider(scrapy.Spider): for ad in ads: infos = ad.css(self.infos_css_sel) + area = infos.xpath('./li[contains(., "m²")]/text()').get() + rooms = infos.xpath('./li[contains(., "p")]/text()').get() + bedrooms = infos.xpath('./li[contains(., "ch")]/text()').get() + if area is not None: + area = area.replace(' m²', '') + if rooms is not None: + rooms = rooms.replace(' p', '') + if bedrooms is not None: + bedrooms = bedrooms.replace(' ch', '') + yield { 'url': ad.css(self.url_css_sel).get().split('?')[0], - 'type': ad.css(self.type_css_sel).get(), - 'price': ad.css(self.price_css_sel).get().replace('\xa0', ' '), - 'rooms': infos.xpath('./li[contains(., "p")]/text()').get(), - 'bedrooms': infos.xpath('./li[contains(., "ch")]/text()').get(), - 'area': infos.xpath('./li[contains(., "m²")]/text()').get() + 'type': ad.css(self.type_css_sel).get().replace('/Villa', ''), + 'price': ad.css(self.price_css_sel).get().replace('\xa0', '').replace(' €', ''), + 'rooms': rooms, + 'bedrooms': bedrooms, + 'area': area } """ pricefloat = float(price.replace(',', '.').replace(' ', '').replace('€', '')) diff --git a/ImmoScrap/spiders/leboncoin.py b/ImmoScrap/spiders/leboncoin.py index 04d756e..bcfd959 100644 --- a/ImmoScrap/spiders/leboncoin.py +++ b/ImmoScrap/spiders/leboncoin.py @@ -1,6 +1,29 @@ import scrapy +def get_attributes(attributes): + for attribute in attributes: + if attribute['key'] == 'real_estate_type': + type = attribute['value_label'] + elif attribute['key'] == 'rooms': + rooms = attribute['value'] + elif attribute['key'] == 'square': + area = attribute['value'] + try: + type + except NameError: + type = None + try: + rooms + except NameError: + rooms = None + try: + area + except NameError: + area = None + return type, rooms, area + + class LeboncoinSpider(scrapy.Spider): name = "leboncoin" @@ -58,13 +81,16 @@ class LeboncoinSpider(scrapy.Spider): json = response.json() ads = json["ads"] for ad in ads: + attributes = ad['attributes'] + type, rooms, area = get_attributes(attributes) + yield { 'url': ad['url'], - 'type': ad['attributes'][0]['value_label'], - 'price': ad['price'], - 'rooms': ad['attributes'][2]['value'], + 'type': type, + 'price': ad['price'][0], + 'rooms': rooms, 'bedrooms': 'Unknown', - 'area': ad['attributes'][1]['value'] + 'area': area } total_ads_nb = json["total"]