85 lines
3.8 KiB
Python
85 lines
3.8 KiB
Python
import scrapy
|
|
from urllib.parse import urlencode
|
|
# from math import ceil
|
|
|
|
# The only requirement to bypass the protections on seloger is to spoof the User-Agent
|
|
|
|
|
|
class SelogerSpider(scrapy.Spider):
    """Crawl seloger.com listing pages for two fixed search areas.

    One request is issued per search area (``vienne_params`` and
    ``stetienne_params``), each sent with a spoofed ``User-Agent`` header.
    Every listing page yields one dict per ad with the keys ``url``,
    ``type``, ``price``, ``rooms``, ``bedrooms`` and ``area`` (all strings,
    or ``None`` when the corresponding element is missing), then follows
    the next page until the pagination counters say the last page was
    reached.
    """

    name = "seloger"

    baseurl = 'https://www.seloger.com/list.htm'

    # Name of the query parameter that carries the page number.
    page_param = 'LISTING-LISTpg'

    # Values sent verbatim as the ``searchareas`` query parameter.
    # NOTE(review): these look like encoded map polylines -- confirm.
    vienne_area = r'uyztGgtx\zLeTdNcPvb@iJpFi@tDLdHtAbNxI|LvJx`@zi@|@bB|Eld@p@jNg@xn@gApPwBzR_FlVcBhEkDdBaDZgLeB{Aw@mwBi}BmBwAaEwEeAsBI}CxNij@'
    stetienne_area = r'}mktGwtwYp@aClDsF`CaHpA_HfFgFlPkIjc@{HjDsBtK?dm@eKxIx@bCbBtDdKRpC}@`GTjIgArGcC`G{@jEyH`L{AbGcCrBmBjDuQ~L_EjDsFvA{G|C_Rx@mBj@eHh@gTN}Ey@oHoC_KkNuE}Cg@mDScGReK'

    # Query parameters shared by every search.
    params = {
        'projects': '2,5',
        'types': '1,2,12,11',
        'natures': '1',
        'price': 'NaN/300000',
        'sort': 'd_dt_crea',
        'enterprise': 0,
        'qsVersion': 1.0,
    }

    # From Python 3.7 dicts keep insertion order. We want the page number
    # to be the last parameter, so it is added after everything else.
    vienne_params = {**params, 'searchareas': vienne_area, page_param: 1}
    stetienne_params = {**params, 'searchareas': stetienne_area, page_param: 1}

    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}

    # Selectors for the listing page markup.
    ads_css_sel = '.ListContent__SmartClassifiedExtended-sc-1viyr2k-2'
    url_css_sel = '.dXJclF::attr(href)'
    type_css_sel = '.joPkKZ::text'
    price_css_sel = '.mVWFG::text'
    infos_css_sel = ".eJYQQA"  # Contains rooms, bedrooms, area
    pagination_xpath = '//div[has-class("ckWPHD")]//text()'

    def start_requests(self):
        """Yield the first listing-page request for each search area."""
        for search_params in (self.vienne_params, self.stetienne_params):
            yield scrapy.Request(
                url=f'{self.baseurl}?{urlencode(search_params)}',
                headers=self.headers,
                callback=self.parse,
            )

    def parse(self, response):
        """Yield one item per ad on the page, then the next-page request.

        Ads without a link are skipped (with a warning) instead of raising,
        so a single malformed ad cannot abort the rest of the page or stop
        pagination.
        """
        for ad in response.css(self.ads_css_sel):
            item = self._parse_ad(ad)
            if item is None:
                self.logger.warning('Skipping ad without URL on %s', response.url)
                continue
            yield item

        # NOTE: price per square metre is not computed here; a previously
        # disabled draft of that computation was removed from this method.

        next_page = self._next_page_url(response)
        if next_page is not None:
            yield response.follow(next_page, headers=self.headers, callback=self.parse)

    @staticmethod
    def _strip_tokens(text, *tokens):
        """Return ``text`` with each of ``tokens`` removed in order, or ``None`` if ``text`` is ``None``."""
        if text is None:
            return None
        for token in tokens:
            text = text.replace(token, '')
        return text

    def _parse_ad(self, ad):
        """Build the item dict for one ad selector, or return ``None`` if it has no URL."""
        url = ad.css(self.url_css_sel).get()
        if url is None:
            return None

        infos = ad.css(self.infos_css_sel)
        area = infos.xpath('./li[contains(., "m²")]/text()').get()
        rooms = infos.xpath('./li[contains(., "p")]/text()').get()
        bedrooms = infos.xpath('./li[contains(., "ch")]/text()').get()

        return {
            # Drop the query string from the ad link.
            'url': url.split('?')[0],
            'type': self._strip_tokens(ad.css(self.type_css_sel).get(), '/Villa'),
            # Remove non-breaking spaces first, then the currency suffix.
            'price': self._strip_tokens(ad.css(self.price_css_sel).get(), '\xa0', ' €'),
            'rooms': self._strip_tokens(rooms, ' p'),
            'bedrooms': self._strip_tokens(bedrooms, ' ch'),
            'area': self._strip_tokens(area, ' m²'),
        }

    def _next_page_url(self, response):
        """Return the URL of the following listing page, or ``None`` to stop.

        Pagination stops when the page number cannot be read from
        ``response.url``, when the pagination block has fewer text nodes
        than expected, or when the two compared counters are equal.
        """
        marker = f'{self.page_param}='
        prefix, found, tail = response.url.partition(marker)
        if not found:
            self.logger.warning('No %s parameter in %s; stopping pagination',
                                self.page_param, response.url)
            return None

        # Tolerate parameters that follow the page number instead of
        # assuming it is always the very end of the URL.
        page_text, amp, rest = tail.partition('&')
        try:
            active_page_nb = int(page_text)
        except ValueError:
            self.logger.warning('Unreadable page number %r in %s; stopping pagination',
                                page_text, response.url)
            return None

        counters = response.xpath(self.pagination_xpath).getall()
        if len(counters) < 5:
            self.logger.warning('Pagination block not found on %s; stopping pagination',
                                response.url)
            return None

        # NOTE(review): text node 4 is presumably the index of the last ad
        # shown on this page and the final node the total ad count --
        # confirm against the live markup.
        current_ads_nb = counters[4]
        total_ads_nb = counters[-1]
        if current_ads_nb == total_ads_nb:  # Last page reached
            return None

        return f'{prefix}{marker}{active_page_nb + 1}{amp}{rest}'
|