# ImmoScrap/ImmoScrap/spiders/Seloger.py
import scrapy
from urllib.parse import urlencode
# from math import ceil
# WRONG : The only requirement to bypass protections on seloger is to spoof User-Agent
# Let's rotate User-Agents
class SelogerSpider(scrapy.Spider):
    """Scrape real-estate ads for sale on seloger.com over two geographic areas.

    Builds one start URL per area (Vienne and Saint-Etienne) from a shared set
    of query parameters, extracts url/type/price/rooms/bedrooms/area from each
    ad card, and follows the listing pagination until the last page.
    """
    name = "seloger"
    baseurl = 'https://www.seloger.com/list.htm'
    # Encoded polygons passed to seloger's "searchareas" query parameter.
    vienne_area = r'uyztGgtx\zLeTdNcPvb@iJpFi@tDLdHtAbNxI|LvJx`@zi@|@bB|Eld@p@jNg@xn@gApPwBzR_FlVcBhEkDdBaDZgLeB{Aw@mwBi}BmBwAaEwEeAsBI}CxNij@'
    stetienne_area = r'}mktGwtwYp@aClDsF`CaHpA_HfFgFlPkIjc@{HjDsBtK?dm@eKxIx@bCbBtDdKRpC}@`GTjIgArGcC`G{@jEyH`L{AbGcCrBmBjDuQ~L_EjDsFvA{G|C_Rx@mBj@eHh@gTN}Ey@oHoC_KkNuE}Cg@mDScGReK'
    # Query parameters shared by both searches: ad categories, price capped at
    # 300 000, sorted by creation date (newest first).
    params = {'projects': '2,5',
              'types': '1,2,12,11',
              'natures': '1',
              'price': 'NaN/300000',
              'sort': 'd_dt_crea',
              'enterprise': 0,
              'qsVersion': 1.0
              }
    vienne_params = params.copy()
    vienne_params['searchareas'] = vienne_area
    vienne_params['LISTING-LISTpg'] = 1  # From Python 3.7 dicts keep insertion order. We want the page to be the last param
    stetienne_params = params.copy()
    stetienne_params['searchareas'] = stetienne_area
    stetienne_params['LISTING-LISTpg'] = 1  # From Python 3.7 dicts keep insertion order. We want the page to be the last param
    # CSS selectors for the ad card and its fields. The class names are
    # frontend-generated (styled-components) and break when seloger redeploys.
    ads_css_sel = '.ListContent__SmartClassifiedExtended-sc-1viyr2k-2'
    url_css_sel = '.dXJclF::attr(href)'
    type_css_sel = '.joPkKZ::text'
    price_css_sel = '.mVWFG::text'
    infos_css_sel = ".eJYQQA"  # Contains rooms, bedrooms, area
    pagination_xpath = '//div[has-class("ckWPHD")]//text()'

    start_urls = [
        f'{baseurl}?{urlencode(vienne_params)}',
        f'{baseurl}?{urlencode(stetienne_params)}'
    ]

    def parse(self, response):
        """Yield a dict per ad on this listing page, then follow the next page.

        :param response: Scrapy response for one listing page (its URL must
            contain the ``LISTING-LISTpg=<n>`` parameter, see start_urls).
        """
        for ad in response.css(self.ads_css_sel):
            infos = ad.css(self.infos_css_sel)
            # NOTE(review): the 'm²' / '€' literals below were mojibake-stripped
            # in the copied source (empty '' remained); reconstructed from the
            # surrounding parsing logic — confirm against live pages.
            area = infos.xpath('./li[contains(., "m²")]/text()').get()
            rooms = infos.xpath('./li[contains(., "p")]/text()').get()
            bedrooms = infos.xpath('./li[contains(., "ch")]/text()').get()
            if area is not None:
                # e.g. "65,5 m²" -> "65.5"
                area = area.replace(' m²', '').replace(',', '.')
            if rooms is not None:
                rooms = rooms.replace(' p', '')       # e.g. "3 p" -> "3"
            if bedrooms is not None:
                bedrooms = bedrooms.replace(' ch', '')  # e.g. "2 ch" -> "2"
            yield {
                # Drop the tracking query string from the ad URL.
                'url': ad.css(self.url_css_sel).get().split('?')[0],
                'type': ad.css(self.type_css_sel).get().replace('/Villa', ''),
                # Price text like "300 000 €" (non-breaking spaces) -> int.
                'price': int(ad.css(self.price_css_sel).get().replace('\xa0', '').replace('€', '')),
                'rooms': rooms,
                'bedrooms': bedrooms,
                'area': area
            }
        # Pagination: "x - y of N" style counters; index 4 is the upper bound of
        # the current page, the last item is the total number of ads.
        active_page = response.url
        active_page_nb = int(active_page.split('LISTING-LISTpg=')[1])
        pagination_texts = response.xpath(self.pagination_xpath).getall()
        current_ads_nb = pagination_texts[4]
        total_ads_nb = pagination_texts[-1]
        if current_ads_nb != total_ads_nb:  # If not last page
            next_page_nb = active_page_nb + 1
            next_page = active_page.split('LISTING-LISTpg=')[0] + f'LISTING-LISTpg={next_page_nb}'
            yield response.follow(next_page, callback=self.parse)