# ImmoScrap/ImmoScrap/spiders/Seloger.py
import scrapy
from urllib.parse import urlencode
# from math import ceil
# WRONG : The only requirement to bypass protections on seloger is to spoof User-Agent
# Let's rotate User-Agents
class SelogerSpider(scrapy.Spider):
    """Scrape real-estate ads for sale on seloger.com over two geographic areas.

    Builds one start URL per area (Vienne and Saint-Etienne) from a shared set
    of query parameters, extracts url/type/price/rooms/bedrooms/area from each
    ad card, and follows the listing pagination until the last page.
    """
    name = "seloger"
    baseurl = 'https://www.seloger.com/list.htm'
    # Encoded polygons passed to seloger's "searchareas" query parameter.
    vienne_area = r'uyztGgtx\zLeTdNcPvb@iJpFi@tDLdHtAbNxI|LvJx`@zi@|@bB|Eld@p@jNg@xn@gApPwBzR_FlVcBhEkDdBaDZgLeB{Aw@mwBi}BmBwAaEwEeAsBI}CxNij@'
    stetienne_area = r'}mktGwtwYp@aClDsF`CaHpA_HfFgFlPkIjc@{HjDsBtK?dm@eKxIx@bCbBtDdKRpC}@`GTjIgArGcC`G{@jEyH`L{AbGcCrBmBjDuQ~L_EjDsFvA{G|C_Rx@mBj@eHh@gTN}Ey@oHoC_KkNuE}Cg@mDScGReK'
    # Query parameters shared by both searches: ad categories, price capped at
    # 300 000, sorted by creation date (newest first).
    params = {'projects': '2,5',
              'types': '1,2,12,11',
              'natures': '1',
              'price': 'NaN/300000',
              'sort': 'd_dt_crea',
              'enterprise': 0,
              'qsVersion': 1.0
              }
    vienne_params = params.copy()
    vienne_params['searchareas'] = vienne_area
    vienne_params['LISTING-LISTpg'] = 1  # From Python 3.7 dicts keep insertion order. We want the page to be the last param
    stetienne_params = params.copy()
    stetienne_params['searchareas'] = stetienne_area
    stetienne_params['LISTING-LISTpg'] = 1  # From Python 3.7 dicts keep insertion order. We want the page to be the last param
    # CSS selectors for the ad card and its fields. The class names are
    # frontend-generated (styled-components) and break when seloger redeploys.
    ads_css_sel = '.ListContent__SmartClassifiedExtended-sc-1viyr2k-2'
    url_css_sel = '.dXJclF::attr(href)'
    type_css_sel = '.joPkKZ::text'
    price_css_sel = '.mVWFG::text'
    infos_css_sel = ".eJYQQA"  # Contains rooms, bedrooms, area
    pagination_xpath = '//div[has-class("ckWPHD")]//text()'

    start_urls = [
        f'{baseurl}?{urlencode(vienne_params)}',
        f'{baseurl}?{urlencode(stetienne_params)}'
    ]

    def parse(self, response):
        """Yield a dict per ad on this listing page, then follow the next page.

        :param response: Scrapy response for one listing page (its URL must
            contain the ``LISTING-LISTpg=<n>`` parameter, see start_urls).
        """
        for ad in response.css(self.ads_css_sel):
            infos = ad.css(self.infos_css_sel)
            # NOTE(review): the 'm²' / '€' literals below were mojibake-stripped
            # in the copied source (empty '' remained); reconstructed from the
            # surrounding parsing logic — confirm against live pages.
            area = infos.xpath('./li[contains(., "m²")]/text()').get()
            rooms = infos.xpath('./li[contains(., "p")]/text()').get()
            bedrooms = infos.xpath('./li[contains(., "ch")]/text()').get()
            if area is not None:
                # e.g. "65,5 m²" -> "65.5"
                area = area.replace(' m²', '').replace(',', '.')
            if rooms is not None:
                rooms = rooms.replace(' p', '')       # e.g. "3 p" -> "3"
            if bedrooms is not None:
                bedrooms = bedrooms.replace(' ch', '')  # e.g. "2 ch" -> "2"
            yield {
                # Drop the tracking query string from the ad URL.
                'url': ad.css(self.url_css_sel).get().split('?')[0],
                'type': ad.css(self.type_css_sel).get().replace('/Villa', ''),
                # Price text like "300 000 €" (non-breaking spaces) -> int.
                'price': int(ad.css(self.price_css_sel).get().replace('\xa0', '').replace('€', '')),
                'rooms': rooms,
                'bedrooms': bedrooms,
                'area': area
            }
        # Pagination: "x - y of N" style counters; index 4 is the upper bound of
        # the current page, the last item is the total number of ads.
        active_page = response.url
        active_page_nb = int(active_page.split('LISTING-LISTpg=')[1])
        pagination_texts = response.xpath(self.pagination_xpath).getall()
        current_ads_nb = pagination_texts[4]
        total_ads_nb = pagination_texts[-1]
        if current_ads_nb != total_ads_nb:  # If not last page
            next_page_nb = active_page_nb + 1
            next_page = active_page.split('LISTING-LISTpg=')[0] + f'LISTING-LISTpg={next_page_nb}'
            yield response.follow(next_page, callback=self.parse)