Compare commits

...

3 Commits

Author SHA1 Message Date
Jordan ERNST ec7acf9308 PricePerSqm Pipeline 2020-08-16 18:55:03 +02:00
Jordan ERNST 828035e166 WIP: pipelines, code cleanup, export now to JSON 2020-08-16 18:10:59 +02:00
Jordan ERNST 34e577a794 Adding CSV export 2020-08-16 17:33:24 +02:00
5 changed files with 20 additions and 16 deletions

View File

@@ -5,9 +5,13 @@
 # useful for handling different item types with a single interface
-from itemadapter import ItemAdapter
+# from itemadapter import ItemAdapter
-class ImmoscrapPipeline:
+class PricePerSqmPipeline:
     def process_item(self, item, spider):
+        area = item['area']
+        if area is not None:
+            pricepersqm = item['price'] / float(item['area'])
+            item['price/sqm'] = round(pricepersqm)
         return item
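
A minimal sketch of the new pipeline's behaviour, assuming an item shaped like the dicts the spiders yield after this change ('price' already an int, 'area' still a numeric string); the spider argument is unused by process_item, so None is passed here:

# Hypothetical standalone check of PricePerSqmPipeline (not part of the diff).
from ImmoScrap.pipelines import PricePerSqmPipeline

item = {'price': 250000, 'area': '120.5'}  # assumed sample item
item = PricePerSqmPipeline().process_item(item, spider=None)
print(item['price/sqm'])  # 250000 / 120.5 -> 2075 (rounded by the pipeline)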

View File

@@ -13,6 +13,12 @@ SPIDER_MODULES = ['ImmoScrap.spiders']
 NEWSPIDER_MODULE = 'ImmoScrap.spiders'
+LOG_LEVEL = 'WARNING'
+FEEDS = {
+    'export.json': {
+        'format': 'json'
+    }
+}
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 # USER_AGENT = 'ImmoScrap (+http://www.yourdomain.com)'
@@ -74,9 +80,9 @@ EXTENSIONS = {
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-# ITEM_PIPELINES = {
-#     'ImmoScrap.pipelines.ImmoscrapPipeline': 300,
-# }
+ITEM_PIPELINES = {
+    'ImmoScrap.pipelines.PricePerSqmPipeline': 300,
+}
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
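
With the JSON feed and the pipeline enabled above, a crawl writes all yielded items as one JSON array to export.json in the project directory. A minimal sketch of consuming that file, assuming the crawl has already run (the field names come from the spiders and the new pipeline; 'price/sqm' is only set when an ad had an area):

import json

with open('export.json') as f:
    items = json.load(f)  # Scrapy's JSON feed exporter writes a single JSON array

for item in items[:5]:
    print(item['url'], item['price'], item.get('price/sqm'))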

View File

@@ -37,10 +37,10 @@ class PAPSpider(scrapy.Spider):
             yield {
                 'url': self.baseurl + url,
                 'type': 'Unknown',
-                'price': ad.css(self.price_css_sel).get().replace('.', '').replace('\xa0', ''),
+                'price': int(ad.css(self.price_css_sel).get().replace('.', '').replace('\xa0', '')),
                 'rooms': rooms,
                 'bedrooms': bedrooms,
-                'area': infos.xpath('./li[contains(small, "m")]/text()').get()
+                'area': infos.xpath('./li[contains(small, "m")]/text()').get().replace(' ', '')
             }
         next_page = response.xpath(self.next_page_xpath).get()
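
For reference, the PAP cleaning chain applied to hypothetical raw values (the sample strings are assumptions, not taken from the site):

# Assumed raw values: the price uses '.' as thousands separator plus a trailing
# non-breaking space, and the area text carries stray spaces.
raw_price = '1.250.000\xa0'
raw_area = ' 120 '
price = int(raw_price.replace('.', '').replace('\xa0', ''))  # -> 1250000
area = raw_area.replace(' ', '')                              # -> '120'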

View File

@@ -57,17 +57,11 @@ class SelogerSpider(scrapy.Spider):
             yield {
                 'url': ad.css(self.url_css_sel).get().split('?')[0],
                 'type': ad.css(self.type_css_sel).get().replace('/Villa', ''),
-                'price': ad.css(self.price_css_sel).get().replace('\xa0', '').replace('€', ''),
+                'price': int(ad.css(self.price_css_sel).get().replace('\xa0', '').replace('€', '')),
                 'rooms': rooms,
                 'bedrooms': bedrooms,
-                'area': area
+                'area': area.replace(',', '.')
             }
-            """
-            pricefloat = float(price.replace(',', '.').replace(' ', '').replace('€', ''))
-            areafloat = float(area.replace(',', '.').replace(' ', '').replace('€', ''))
-            pricesqmint = round(pricefloat / areafloat)
-            print(url, type, price, rooms, bedrooms, area, pricesqmint)
-            """
         active_page = response.url
         active_page_nb = int(active_page.split('LISTING-LISTpg=')[1])
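
Likewise for SeLoger, assuming a raw price with non-breaking spaces and a euro sign, and an area using a comma decimal (the sample values are hypothetical):

# Assumed raw values from a SeLoger result card.
raw_price = '250\xa0000\xa0€'
raw_area = '120,5'
price = int(raw_price.replace('\xa0', '').replace('€', ''))  # -> 250000
area = raw_area.replace(',', '.')                             # -> '120.5', float-parsable downstream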

View File

@@ -87,7 +87,7 @@ class LeboncoinSpider(scrapy.Spider):
             yield {
                 'url': ad['url'],
                 'type': type,
-                'price': ad['price'][0],
+                'price': int(ad['price'][0]),
                 'rooms': rooms,
                 'bedrooms': 'Unknown',
                 'area': area
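
The Leboncoin ad entries are plain dicts (presumably parsed from a JSON payload), and the price is assumed to arrive as a one-element list, hence the [0] indexing and the int() cast; the sample entry below is hypothetical:

# Assumed shape of one Leboncoin ad entry (hypothetical values).
ad = {'url': 'https://www.leboncoin.fr/ventes_immobilieres/1234567890.htm',
      'price': ['250000']}
price = int(ad['price'][0])  # -> 250000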