ImmoScrap/ImmoScrap/spiders/PAP.py

52 lines
1.9 KiB
Python

import scrapy
class PAPSpider(scrapy.Spider):
    """Spider collecting property-for-sale listings from pap.fr search pages.

    Each listing found on a search-result page is yielded as a dict with the
    keys ``url``, ``type``, ``price``, ``rooms``, ``bedrooms`` and ``area``.
    Values that cannot be found in the page are yielded as ``None``.
    Pagination is followed through the "next page" link until none is left.
    """

    name = "pap"
    baseurl = 'https://www.pap.fr'

    # Selectors applied to the search-result page / to each listing node.
    ads_css_sel = '.search-list-item-alt'
    url_css_sel = '.item-title::attr(href)'
    price_css_sel = '.item-price::text'
    infos_css_sel = '.item-tags'  # Contains rooms, bedrooms, area
    next_page_xpath = '//a[@id="pagination-next"]/@href'

    def start_requests(self):
        """Yield the initial search-result requests, parsed by :meth:`parse`."""
        urls = [
            'https://www.pap.fr/annonce/vente-appartement-immeuble-maison-vienne-38200-g21767-jusqu-a-300000-euros',
            'https://www.pap.fr/annonce/vente-appartement-immeuble-maison-saint-etienne-42-g43641-jusqu-a-300000-euros',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    @staticmethod
    def _strip_label(text, label):
        """Return *text* without *label* and without any ``'s'`` character.

        Removing ``'s'`` gets rid of the plural mark left behind once the
        singular label has been removed (e.g. "3 pièces" -> "3").
        ``None`` (tag absent from the listing) is passed through unchanged.
        """
        if text is None:
            return None
        return text.replace(label, '').replace('s', '')

    def parse(self, response):
        """Extract every listing of a search-result page, then follow pagination.

        Yields one dict per listing, followed by a request for the next
        result page when the pagination link is present.
        """
        for ad in response.css(self.ads_css_sel):
            url = ad.css(self.url_css_sel).get()
            # Listings hosted on the website have a relative link (leading
            # '/'); anything else is an advertisement. A missing or empty link
            # is skipped as well instead of crashing on ``url[0]``. Only the
            # current item is skipped so later listings are still collected.
            if not url or not url.startswith('/'):
                continue

            infos = ad.css(self.infos_css_sel)
            rooms = infos.xpath('./li[contains(., "pièce")]/text()').get()
            bedrooms = infos.xpath('./li[contains(., "chambre")]/text()').get()

            price = ad.css(self.price_css_sel).get()
            if price is not None:
                # Drop thousands separators ('.') and non-breaking spaces.
                price = price.replace('.', '').replace('\xa0', '')

            yield {
                'url': self.baseurl + url,
                'type': 'Unknown',
                'price': price,
                'rooms': self._strip_label(rooms, ' pièce'),
                'bedrooms': self._strip_label(bedrooms, ' chambre'),
                # Text of the tag whose <small> child contains "m"
                # (presumably the m² unit -- confirm against the page markup).
                'area': infos.xpath('./li[contains(small, "m")]/text()').get(),
            }

        next_page = response.xpath(self.next_page_xpath).get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)