WIP: pipelines, code cleanup, export now to JSON

This commit is contained in:
Jordan ERNST 2020-08-16 18:10:59 +02:00
parent 34e577a794
commit 828035e166
3 changed files with 14 additions and 13 deletions

View File

@@ -5,9 +5,16 @@
# useful for handling different item types with a single interface # useful for handling different item types with a single interface
from itemadapter import ItemAdapter # from itemadapter import ItemAdapter
class ImmoscrapPipeline: class PricePerSqmPipeline:
def process_item(self, item, spider): def process_item(self, item, spider):
'''
pricefloat = float(price.replace(',', '.').replace(' ', '').replace('', ''))
areafloat = float(area.replace(',', '.').replace(' ', '').replace('', ''))
pricesqmint = round(pricefloat / areafloat)
pricepersqm = float(item['price']) / float(item['area'])
item['price/sqm'] = pricepersqm
'''
return item return item

View File

@@ -14,8 +14,8 @@ NEWSPIDER_MODULE = 'ImmoScrap.spiders'
LOG_LEVEL = 'WARNING' LOG_LEVEL = 'WARNING'
FEEDS = { FEEDS = {
'export.csv': { 'export.json': {
'format': 'csv' 'format': 'json'
} }
} }
@@ -80,9 +80,9 @@ EXTENSIONS = {
# Configure item pipelines # Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = { ITEM_PIPELINES = {
# 'ImmoScrap.pipelines.ImmoscrapPipeline': 300, 'ImmoScrap.pipelines.PricePerSqmPipeline': 300,
# } }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html # See https://docs.scrapy.org/en/latest/topics/autothrottle.html

View File

@@ -62,12 +62,6 @@ class SelogerSpider(scrapy.Spider):
'bedrooms': bedrooms, 'bedrooms': bedrooms,
'area': area 'area': area
} }
"""
pricefloat = float(price.replace(',', '.').replace(' ', '').replace('', ''))
areafloat = float(area.replace(',', '.').replace(' ', '').replace('', ''))
pricesqmint = round(pricefloat / areafloat)
print(url, type, price, rooms, bedrooms, area, pricesqmint)
"""
active_page = response.url active_page = response.url
active_page_nb = int(active_page.split('LISTING-LISTpg=')[1]) active_page_nb = int(active_page.split('LISTING-LISTpg=')[1])