diff --git a/ImmoScrap/pipelines.py b/ImmoScrap/pipelines.py index d55da0f..de123b6 100644 --- a/ImmoScrap/pipelines.py +++ b/ImmoScrap/pipelines.py @@ -5,9 +5,16 @@ # useful for handling different item types with a single interface -from itemadapter import ItemAdapter +# from itemadapter import ItemAdapter -class ImmoscrapPipeline: +class PricePerSqmPipeline: def process_item(self, item, spider): + ''' + pricefloat = float(price.replace(',', '.').replace(' ', '').replace('€', '')) + areafloat = float(area.replace(',', '.').replace(' ', '').replace('m²', '')) + pricesqmint = round(pricefloat / areafloat) + pricepersqm = float(item['price']) / float(item['area']) + item['price/sqm'] = pricepersqm + ''' return item diff --git a/ImmoScrap/settings.py b/ImmoScrap/settings.py index 11d0ea9..40b4024 100644 --- a/ImmoScrap/settings.py +++ b/ImmoScrap/settings.py @@ -14,8 +14,8 @@ NEWSPIDER_MODULE = 'ImmoScrap.spiders' LOG_LEVEL = 'WARNING' FEEDS = { - 'export.csv': { - 'format': 'csv' + 'export.json': { + 'format': 'json' } } @@ -80,9 +80,9 @@ EXTENSIONS = { # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -# ITEM_PIPELINES = { -# 'ImmoScrap.pipelines.ImmoscrapPipeline': 300, -# } +ITEM_PIPELINES = { + 'ImmoScrap.pipelines.PricePerSqmPipeline': 300, +} # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html diff --git a/ImmoScrap/spiders/Seloger.py b/ImmoScrap/spiders/Seloger.py index 8b57425..dbfcab5 100644 --- a/ImmoScrap/spiders/Seloger.py +++ b/ImmoScrap/spiders/Seloger.py @@ -62,12 +62,6 @@ class SelogerSpider(scrapy.Spider): 'bedrooms': bedrooms, 'area': area } - """ - pricefloat = float(price.replace(',', '.').replace(' ', '').replace('€', '')) - areafloat = float(area.replace(',', '.').replace(' ', '').replace('m²', '')) - pricesqmint = round(pricefloat / areafloat) - print(url, type, price, rooms, bedrooms, area, pricesqmint) - """ active_page = response.url active_page_nb = int(active_page.split('LISTING-LISTpg=')[1])