Compare commits

...

3 Commits

Author SHA1 Message Date
Jordan ERNST ec7acf9308 PricePerSqm Pipeline 2020-08-16 18:55:03 +02:00
Jordan ERNST 828035e166 WIP: pipelines, code cleanup, export now to JSON 2020-08-16 18:10:59 +02:00
Jordan ERNST 34e577a794 Adding CSV export 2020-08-16 17:33:24 +02:00
5 changed files with 20 additions and 16 deletions

View File

@@ -5,9 +5,13 @@
 # useful for handling different item types with a single interface
-from itemadapter import ItemAdapter
+# from itemadapter import ItemAdapter
-class ImmoscrapPipeline:
+class PricePerSqmPipeline:
     def process_item(self, item, spider):
+        area = item['area']
+        if area is not None:
+            pricepersqm = item['price'] / float(item['area'])
+            item['price/sqm'] = round(pricepersqm)
         return item
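
A minimal sketch of the new pipeline's behaviour, assuming an item shaped like the dicts the spiders yield after this change ('price' already an int, 'area' still a numeric string); the spider argument is unused by process_item, so None is passed here:

# Hypothetical standalone check of PricePerSqmPipeline (not part of the diff).
from ImmoScrap.pipelines import PricePerSqmPipeline

item = {'price': 250000, 'area': '120.5'}  # assumed sample item
item = PricePerSqmPipeline().process_item(item, spider=None)
print(item['price/sqm'])  # 250000 / 120.5 -> 2075 (rounded by the pipeline)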

View File

@@ -13,6 +13,12 @@ SPIDER_MODULES = ['ImmoScrap.spiders']
 NEWSPIDER_MODULE = 'ImmoScrap.spiders'
+LOG_LEVEL = 'WARNING'
+FEEDS = {
+    'export.json': {
+        'format': 'json'
+    }
+}
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 # USER_AGENT = 'ImmoScrap (+http://www.yourdomain.com)'
@@ -74,9 +80,9 @@ EXTENSIONS = {
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-# ITEM_PIPELINES = {
-#     'ImmoScrap.pipelines.ImmoscrapPipeline': 300,
-# }
+ITEM_PIPELINES = {
+    'ImmoScrap.pipelines.PricePerSqmPipeline': 300,
+}
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
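
With the JSON feed and the pipeline enabled above, a crawl writes all yielded items as one JSON array to export.json in the project directory. A minimal sketch of consuming that file, assuming the crawl has already run (the field names come from the spiders and the new pipeline; 'price/sqm' is only set when an ad had an area):

import json

with open('export.json') as f:
    items = json.load(f)  # Scrapy's JSON feed exporter writes a single JSON array

for item in items[:5]:
    print(item['url'], item['price'], item.get('price/sqm'))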

View File

@@ -37,10 +37,10 @@ class PAPSpider(scrapy.Spider):
             yield {
                 'url': self.baseurl + url,
                 'type': 'Unknown',
-                'price': ad.css(self.price_css_sel).get().replace('.', '').replace('\xa0', ''),
+                'price': int(ad.css(self.price_css_sel).get().replace('.', '').replace('\xa0', '')),
                 'rooms': rooms,
                 'bedrooms': bedrooms,
-                'area': infos.xpath('./li[contains(small, "m")]/text()').get()
+                'area': infos.xpath('./li[contains(small, "m")]/text()').get().replace(' ', '')
             }
         next_page = response.xpath(self.next_page_xpath).get()
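
For reference, the PAP cleaning chain applied to hypothetical raw values (the sample strings are assumptions, not taken from the site):

# Assumed raw values: the price uses '.' as thousands separator plus a trailing
# non-breaking space, and the area text carries stray spaces.
raw_price = '1.250.000\xa0'
raw_area = ' 120 '
price = int(raw_price.replace('.', '').replace('\xa0', ''))  # -> 1250000
area = raw_area.replace(' ', '')                              # -> '120'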

View File

@@ -57,17 +57,11 @@ class SelogerSpider(scrapy.Spider):
             yield {
                 'url': ad.css(self.url_css_sel).get().split('?')[0],
                 'type': ad.css(self.type_css_sel).get().replace('/Villa', ''),
-                'price': ad.css(self.price_css_sel).get().replace('\xa0', '').replace('€', ''),
+                'price': int(ad.css(self.price_css_sel).get().replace('\xa0', '').replace('€', '')),
                 'rooms': rooms,
                 'bedrooms': bedrooms,
-                'area': area
+                'area': area.replace(',', '.')
             }
-            """
-            pricefloat = float(price.replace(',', '.').replace(' ', '').replace('€', ''))
-            areafloat = float(area.replace(',', '.').replace(' ', '').replace('€', ''))
-            pricesqmint = round(pricefloat / areafloat)
-            print(url, type, price, rooms, bedrooms, area, pricesqmint)
-            """
         active_page = response.url
         active_page_nb = int(active_page.split('LISTING-LISTpg=')[1])
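
Likewise for SeLoger, assuming a raw price with non-breaking spaces and a euro sign, and an area using a comma decimal (the sample values are hypothetical):

# Assumed raw values from a SeLoger result card.
raw_price = '250\xa0000\xa0€'
raw_area = '120,5'
price = int(raw_price.replace('\xa0', '').replace('€', ''))  # -> 250000
area = raw_area.replace(',', '.')                             # -> '120.5', float-parsable downstream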

View File

@@ -87,7 +87,7 @@ class LeboncoinSpider(scrapy.Spider):
             yield {
                 'url': ad['url'],
                 'type': type,
-                'price': ad['price'][0],
+                'price': int(ad['price'][0]),
                 'rooms': rooms,
                 'bedrooms': 'Unknown',
                 'area': area
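
The Leboncoin ad entries are plain dicts (presumably parsed from a JSON payload), and the price is assumed to arrive as a one-element list, hence the [0] indexing and the int() cast; the sample entry below is hypothetical:

# Assumed shape of one Leboncoin ad entry (hypothetical values).
ad = {'url': 'https://www.leboncoin.fr/ventes_immobilieres/1234567890.htm',
      'price': ['250000']}
price = int(ad['price'][0])  # -> 250000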