Unify yield values + leboncoin: Fixes

2020-08-03 23:31:38 +02:00 · 2020-08-03 23:31:38 +02:00 · 814aea4e66
commit 814aea4e66
parent 7e58f445a5
4 changed files with 56 additions and 12 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 __pycache__
+*.csv
--- a/ImmoScrap/spiders/PAP.py
+++ b/ImmoScrap/spiders/PAP.py
@@ -30,12 +30,19 @@ class PAPSpider(scrapy.Spider):
            if url[0] != '/':    # If url starts with / : on website, else: advertisement
                break

+            rooms = infos.xpath('./li[contains(., "pièce")]/text()').get()
+            bedrooms = infos.xpath('./li[contains(., "chambre")]/text()').get()
+            if rooms is not None:
+                rooms = rooms.replace(' pièce', '').replace('s', '')  # s, left if plural
+            if bedrooms is not None:
+                bedrooms = bedrooms.replace(' chambre', '').replace('s', '')  # s, left if plural
+
            yield {
                'url': self.baseurl + url,
                'type': 'Unknown',
-                'price': ad.css(self.price_css_sel).get(),
-                'rooms': infos.xpath('./li[contains(., "pièces")]/text()').get(),
-                'bedrooms': infos.xpath('./li[contains(., "chambres")]/text()').get(),
+                'price': ad.css(self.price_css_sel).get().replace('.', '').replace('\xa0€', ''),
+                'rooms': rooms,
+                'bedrooms': bedrooms,
                'area': infos.xpath('./li[contains(small, "m")]/text()').get()
            }

--- a/ImmoScrap/spiders/Seloger.py
+++ b/ImmoScrap/spiders/Seloger.py
@@ -48,13 +48,23 @@ class SelogerSpider(scrapy.Spider):
        for ad in ads:
            infos = ad.css(self.infos_css_sel)

+            area = infos.xpath('./li[contains(., "m²")]/text()').get()
+            rooms = infos.xpath('./li[contains(., "p")]/text()').get()
+            bedrooms = infos.xpath('./li[contains(., "ch")]/text()').get()
+            if area is not None:
+                area = area.replace(' m²', '')
+            if rooms is not None:
+                rooms = rooms.replace(' p', '')
+            if bedrooms is not None:
+                bedrooms = bedrooms.replace(' ch', '')
+
            yield {
                'url': ad.css(self.url_css_sel).get().split('?')[0],
-                'type': ad.css(self.type_css_sel).get(),
-                'price': ad.css(self.price_css_sel).get().replace('\xa0', ' '),
-                'rooms': infos.xpath('./li[contains(., "p")]/text()').get(),
-                'bedrooms': infos.xpath('./li[contains(., "ch")]/text()').get(),
-                'area': infos.xpath('./li[contains(., "m²")]/text()').get()
+                'type': ad.css(self.type_css_sel).get().replace('/Villa', ''),
+                'price': ad.css(self.price_css_sel).get().replace('\xa0', '').replace(' €', ''),
+                'rooms': rooms,
+                'bedrooms': bedrooms,
+                'area': area
            }
            """
            pricefloat = float(price.replace(',', '.').replace(' ', '').replace('€', ''))
--- a/ImmoScrap/spiders/leboncoin.py
+++ b/ImmoScrap/spiders/leboncoin.py
@@ -1,6 +1,29 @@
 import scrapy


+def get_attributes(attributes):
+    """Extract the type, room count and area from a leboncoin ad.
+
+    ``attributes`` is an iterable of dicts that each have a ``'key'`` entry:
+    ``real_estate_type`` provides ``'value_label'``, while ``rooms`` and
+    ``square`` provide ``'value'``.  Returns a ``(type, rooms, area)``
+    tuple in which a field with no matching entry is ``None``, so an empty
+    ``attributes`` gives ``(None, None, None)``.
+    """
+    # Seed every field first: the return then never reads an unbound local,
+    # even when the loop body never runs. ``kind`` avoids shadowing ``type``.
+    kind = rooms = area = None
+    for attribute in attributes:
+        key = attribute['key']
+        if key == 'real_estate_type':
+            kind = attribute['value_label']
+        elif key == 'rooms':
+            rooms = attribute['value']
+        elif key == 'square':
+            area = attribute['value']
+    return kind, rooms, area
+
+
 class LeboncoinSpider(scrapy.Spider):
    name = "leboncoin"

@@ -58,13 +81,16 @@ class LeboncoinSpider(scrapy.Spider):
        json = response.json()
        ads = json["ads"]
        for ad in ads:
+            attributes = ad['attributes']
+            type, rooms, area = get_attributes(attributes)
+
            yield {
                'url': ad['url'],
-                'type': ad['attributes'][0]['value_label'],
-                'price': ad['price'],
-                'rooms': ad['attributes'][2]['value'],
+                'type': type,
+                'price': ad['price'][0],
+                'rooms': rooms,
                'bedrooms': 'Unknown',
-                'area': ad['attributes'][1]['value']
+                'area': area
            }

        total_ads_nb = json["total"]