From 42ad03f6e4fdc2dd70f0642ab522a57340ec94f4 Mon Sep 17 00:00:00 2001
From: Jordan ERNST
Date: Thu, 2 Jul 2020 13:02:05 +0200
Subject: [PATCH] Added PAP spider

---
 ImmoScrap/spiders/PAP.py     | 55 ++++++++++++++++++++++++++++++++++++
 ImmoScrap/spiders/Seloger.py |  2 +-
 2 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100644 ImmoScrap/spiders/PAP.py

diff --git a/ImmoScrap/spiders/PAP.py b/ImmoScrap/spiders/PAP.py
new file mode 100644
index 0000000..a2e9b9c
--- /dev/null
+++ b/ImmoScrap/spiders/PAP.py
@@ -0,0 +1,55 @@
+import scrapy
+
+
+class PAPSpider(scrapy.Spider):
+    """Scrape property sale listings from pap.fr search result pages."""
+
+    name = "pap"
+
+    baseurl = 'https://www.pap.fr'
+
+    # Selectors applied to each search result page / listing card.
+    ads_css_sel = '.search-list-item-alt'
+    url_css_sel = '.item-title::attr(href)'
+    price_css_sel = '.item-price::text'
+    infos_css_sel = '.item-tags'  # Contains Rooms, bedrooms, area
+
+    next_page_xpath = '//a[@id="pagination-next"]/@href'
+
+    def start_requests(self):
+        """Yield the initial search request(s), handled by parse()."""
+        urls = [
+            'https://www.pap.fr/annonce/vente-appartement-immeuble-maison-vienne-38200-g21767-jusqu-a-300000-euros'
+        ]
+        for url in urls:
+            yield scrapy.Request(url=url, callback=self.parse)
+
+    def parse(self, response):
+        """Yield one item per listing, then follow the next-page link.
+
+        Each item is a dict with the keys 'url', 'type', 'price', 'rooms',
+        'bedrooms' and 'area'; scraped values are raw text and are None
+        when the matching node is absent.
+        """
+        for ad in response.css(self.ads_css_sel):
+            url = ad.css(self.url_css_sel).get()
+            # Listings hosted on the site have a site-relative link (leading
+            # '/'); anything else is an advertisement. Skip only that entry:
+            # a `break` here would drop every listing after the first ad,
+            # and a missing link (None) must not crash the page.
+            if not url or not url.startswith('/'):
+                continue
+
+            infos = ad.css(self.infos_css_sel)
+            yield {
+                'url': self.baseurl + url,
+                'type': 'Unknown',
+                'price': ad.css(self.price_css_sel).get(),
+                'rooms': infos.xpath('./li[contains(., "pièces")]/text()').get(),
+                'bedrooms': infos.xpath('./li[contains(., "chambres")]/text()').get(),
+                'area': infos.xpath('./li[contains(small, "m")]/text()').get()
+            }
+
+        next_page = response.xpath(self.next_page_xpath).get()
+        if next_page:
+            yield response.follow(next_page, callback=self.parse)
diff --git a/ImmoScrap/spiders/Seloger.py b/ImmoScrap/spiders/Seloger.py
index ab24829..7e7d9f7 100644
--- a/ImmoScrap/spiders/Seloger.py
+++ b/ImmoScrap/spiders/Seloger.py
@@ -28,12 +28,12 @@ class
SelogerSpider(scrapy.Spider): headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'} - pagination_xpath = '//div[has-class("ckWPHD")]//text()' ads_css_sel = '.ListContent__SmartClassifiedExtended-sc-1viyr2k-2' url_css_sel = '.dXJclF::attr(href)' type_css_sel = '.joPkKZ::text' price_css_sel = '.mVWFG::text' infos_css_sel = ".eJYQQA" # Contains Rooms, bedrooms, area + pagination_xpath = '//div[has-class("ckWPHD")]//text()' def start_requests(self): urls = [