from urllib.parse import urlencode

import scrapy

# WRONG: The only requirement to bypass protections on seloger is to spoof the User-Agent
# Let's rotate User-Agents


class SelogerSpider(scrapy.Spider):
    """Scrape listing result pages from seloger.com for two search areas.

    Two start URLs are built from a shared set of search parameters plus an
    area-specific ``searchareas`` value. Each result page yields one dict per
    ad (``url``, ``type``, ``price``, ``rooms``, ``bedrooms``, ``area``) and,
    while the pagination counters differ, a request for the next page.

    NOTE(review): the CSS selectors below look like generated class names and
    will presumably stop matching when the site markup changes -- verify them
    against the live pages before relying on this spider.
    """

    name = "seloger"
    baseurl = 'https://www.seloger.com/list.htm'

    # Values passed verbatim as the ``searchareas`` query parameter.
    vienne_area = r'uyztGgtx\zLeTdNcPvb@iJpFi@tDLdHtAbNxI|LvJx`@zi@|@bB|Eld@p@jNg@xn@gApPwBzR_FlVcBhEkDdBaDZgLeB{Aw@mwBi}BmBwAaEwEeAsBI}CxNij@'
    stetienne_area = r'}mktGwtwYp@aClDsF`CaHpA_HfFgFlPkIjc@{HjDsBtK?dm@eKxIx@bCbBtDdKRpC}@`GTjIgArGcC`G{@jEyH`L{AbGcCrBmBjDuQ~L_EjDsFvA{G|C_Rx@mBj@eHh@gTN}Ey@oHoC_KkNuE}Cg@mDScGReK'

    # Name of the query parameter carrying the result page number.
    _PAGE_PARAM = 'LISTING-LISTpg'

    # Search parameters shared by every start URL.
    params = {
        'projects': '2,5',
        'types': '1,2,12,11',
        'natures': '1',
        'price': 'NaN/300000',
        'sort': 'd_dt_crea',
        'enterprise': 0,
        'qsVersion': 1.0,
    }

    # Since Python 3.7 dicts keep insertion order. We want the page number to
    # be the last parameter, because the pagination code in ``parse`` rewrites
    # the URL around it.
    vienne_params = params.copy()
    vienne_params['searchareas'] = vienne_area
    vienne_params[_PAGE_PARAM] = 1

    stetienne_params = params.copy()
    stetienne_params['searchareas'] = stetienne_area
    stetienne_params[_PAGE_PARAM] = 1

    ads_css_sel = '.ListContent__SmartClassifiedExtended-sc-1viyr2k-2'
    url_css_sel = '.dXJclF::attr(href)'
    type_css_sel = '.joPkKZ::text'
    price_css_sel = '.mVWFG::text'
    infos_css_sel = ".eJYQQA"  # Contains rooms, bedrooms, area
    pagination_xpath = '//div[has-class("ckWPHD")]//text()'

    start_urls = [
        f'{baseurl}?{urlencode(vienne_params)}',
        f'{baseurl}?{urlencode(stetienne_params)}',
    ]

    def parse(self, response):
        """Yield one item per ad on the page, then a request for the next page.

        Ads whose url, type or price cannot be extracted are skipped with a
        warning instead of aborting the whole page.

        Args:
            response: the Scrapy response for a listing result page.

        Yields:
            dict items describing each ad, followed by at most one
            ``Request`` for the next result page.
        """
        for ad in response.css(self.ads_css_sel):
            item = self._parse_ad(ad)
            if item is None:
                self.logger.warning(
                    'Skipping ad with missing or unparsable url/type/price on %s',
                    response.url,
                )
                continue
            yield item

        next_page = self._next_page_url(response)
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def _parse_ad(self, ad):
        """Build the item dict for one ad selector, or return None if unusable.

        ``rooms``, ``bedrooms`` and ``area`` are optional and left as ``None``
        when absent; ``url``, ``type`` and ``price`` are required.
        """
        url = ad.css(self.url_css_sel).get()
        ad_type = ad.css(self.type_css_sel).get()
        raw_price = ad.css(self.price_css_sel).get()
        # ``.get()`` returns None when the selector matches nothing.
        if url is None or ad_type is None or raw_price is None:
            return None

        try:
            # Strip non-breaking spaces and the trailing euro sign.
            price = int(raw_price.replace('\xa0', '').replace(' €', ''))
        except ValueError:
            return None

        infos = ad.css(self.infos_css_sel)
        area = infos.xpath('./li[contains(., "m²")]/text()').get()
        rooms = infos.xpath('./li[contains(., "p")]/text()').get()
        bedrooms = infos.xpath('./li[contains(., "ch")]/text()').get()

        if area is not None:
            # Drop the unit and switch the decimal comma to a dot.
            area = area.replace(' m²', '').replace(',', '.')
        if rooms is not None:
            rooms = rooms.replace(' p', '')
        if bedrooms is not None:
            bedrooms = bedrooms.replace(' ch', '')

        return {
            'url': url.split('?')[0],  # drop the query string
            'type': ad_type.replace('/Villa', ''),
            'price': price,
            'rooms': rooms,
            'bedrooms': bedrooms,
            'area': area,
        }

    def _next_page_url(self, response):
        """Return the URL of the next result page, or None when there is none.

        The pagination block is read as a list of text nodes; the node at
        index 4 is compared with the last one, and equal values are treated
        as "last page reached".
        NOTE(review): presumably these are "last ad shown" and "total ads" --
        confirm against the page markup.
        """
        texts = response.xpath(self.pagination_xpath).getall()
        if len(texts) < 5:
            # No (or an unexpected) pagination block: nothing to follow.
            self.logger.debug('No usable pagination block on %s', response.url)
            return None
        if texts[4] == texts[-1]:
            return None

        marker = f'{self._PAGE_PARAM}='
        base, sep, tail = response.url.partition(marker)
        if not sep:
            self.logger.warning('No %s parameter in %s', self._PAGE_PARAM, response.url)
            return None

        # Tolerate parameters that follow the page number and keep them.
        digits, amp, rest = tail.partition('&')
        try:
            page_nb = int(digits)
        except ValueError:
            self.logger.warning('Unparsable page number in %s', response.url)
            return None

        return f'{base}{marker}{page_nb + 1}{amp}{rest}'