Added PAP spider
This commit is contained in:
parent
9a2cc429d0
commit
42ad03f6e4
|
@ -0,0 +1,43 @@
|
||||||
|
import scrapy
|
||||||
|
|
||||||
|
|
||||||
|
class PAPSpider(scrapy.Spider):
    """Scrape property listings from pap.fr search result pages.

    Each listing found on a result page is yielded as a dict with the keys
    ``url``, ``type``, ``price``, ``rooms``, ``bedrooms`` and ``area``.
    Result pages are followed through the "next page" link until none is
    found.
    """

    name = "pap"

    # Prefix added to the site-relative listing links found on result pages.
    baseurl = 'https://www.pap.fr'

    ads_css_sel = '.search-list-item-alt'
    url_css_sel = '.item-title::attr(href)'
    price_css_sel = '.item-price::text'
    infos_css_sel = '.item-tags'  # Contains Rooms, bedrooms, area

    next_page_xpath = '//a[@id="pagination-next"]/@href'

    def start_requests(self):
        """Yield the initial search request(s), handled by :meth:`parse`."""
        urls = [
            'https://www.pap.fr/annonce/vente-appartement-immeuble-maison-vienne-38200-g21767-jusqu-a-300000-euros',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract every listing of a result page, then follow pagination.

        Args:
            response: The Scrapy response for one search result page.

        Yields:
            One dict per on-site listing, then (if a next-page link exists)
            a request for the following result page.
        """
        for ad in response.css(self.ads_css_sel):
            url = ad.css(self.url_css_sel).get()

            # Listings hosted on the site use a site-relative link (leading
            # "/"); anything else is an advertisement. Entries without a link
            # at all are skipped too, instead of crashing on ``None[0]``.
            # ``continue`` (not ``break``) so that one advertisement does not
            # discard the real listings that come after it on the page.
            if not url or not url.startswith('/'):
                continue

            infos = ad.css(self.infos_css_sel)

            yield {
                'url': self.baseurl + url,
                'type': 'Unknown',
                'price': ad.css(self.price_css_sel).get(),
                'rooms': infos.xpath('./li[contains(., "pièces")]/text()').get(),
                'bedrooms': infos.xpath('./li[contains(., "chambres")]/text()').get(),
                'area': infos.xpath('./li[contains(small, "m")]/text()').get(),
            }

        next_page = response.xpath(self.next_page_xpath).get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
|
|
@ -28,12 +28,12 @@ class SelogerSpider(scrapy.Spider):
|
||||||
|
|
||||||
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
|
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
|
||||||
|
|
||||||
pagination_xpath = '//div[has-class("ckWPHD")]//text()'
|
|
||||||
ads_css_sel = '.ListContent__SmartClassifiedExtended-sc-1viyr2k-2'
|
ads_css_sel = '.ListContent__SmartClassifiedExtended-sc-1viyr2k-2'
|
||||||
url_css_sel = '.dXJclF::attr(href)'
|
url_css_sel = '.dXJclF::attr(href)'
|
||||||
type_css_sel = '.joPkKZ::text'
|
type_css_sel = '.joPkKZ::text'
|
||||||
price_css_sel = '.mVWFG::text'
|
price_css_sel = '.mVWFG::text'
|
||||||
infos_css_sel = ".eJYQQA" # Contains Rooms, bedrooms, area
|
infos_css_sel = ".eJYQQA" # Contains Rooms, bedrooms, area
|
||||||
|
pagination_xpath = '//div[has-class("ckWPHD")]//text()'
|
||||||
|
|
||||||
def start_requests(self):
|
def start_requests(self):
|
||||||
urls = [
|
urls = [
|
||||||
|
|
Loading…
Reference in New Issue