import scrapy


class PAPSpider(scrapy.Spider):
    """Spider that collects property-sale listings from pap.fr search pages.

    Each search-result page is scanned for listing tiles; one dict per
    listing is yielded with the keys ``url``, ``type``, ``price``, ``rooms``,
    ``bedrooms`` and ``area``. Pagination is followed through the
    "pagination-next" link until no such link is present.
    """

    name = "pap"
    baseurl = 'https://www.pap.fr'

    # Selectors used to locate the pieces of a search-result page.
    ads_css_sel = '.search-list-item-alt'
    url_css_sel = '.item-title::attr(href)'
    price_css_sel = '.item-price::text'
    infos_css_sel = '.item-tags'  # Contains rooms, bedrooms, area
    next_page_xpath = '//a[@id="pagination-next"]/@href'

    start_urls = [
        'https://www.pap.fr/annonce/vente-appartement-immeuble-maison-vienne-38200-g21767-jusqu-a-300000-euros',
        'https://www.pap.fr/annonce/vente-appartement-immeuble-maison-saint-etienne-42-g43641-jusqu-a-300000-euros',
    ]

    @staticmethod
    def _strip_label(text, label):
        """Remove *label* (and any leftover plural 's') from *text*.

        Returns ``None`` unchanged so that missing tags stay ``None`` in the
        yielded item.
        """
        if text is None:
            return None
        # The trailing 's' is what remains of the label when it is plural.
        return text.replace(label, '').replace('s', '')

    def parse(self, response):
        """Yield one item per on-site listing, then follow the next page.

        Args:
            response: The Scrapy response for a search-result page.

        Yields:
            dict: One record per listing whose link is a site-relative path.
            scrapy.Request: A request for the next result page, if any.
        """
        for ad in response.css(self.ads_css_sel):
            url = ad.css(self.url_css_sel).get()
            # A site-relative href (leading '/') is a real listing; anything
            # else is an advertisement tile. Skip only that tile (and tiles
            # with no link at all) rather than abandoning the rest of the page.
            if not url or not url.startswith('/'):
                continue

            infos = ad.css(self.infos_css_sel)
            rooms = infos.xpath('./li[contains(., "pièce")]/text()').get()
            bedrooms = infos.xpath('./li[contains(., "chambre")]/text()').get()

            # The price node may be absent; keep None instead of crashing.
            price = ad.css(self.price_css_sel).get()
            if price is not None:
                price = price.replace('.', '').replace('\xa0€', '')

            yield {
                'url': self.baseurl + url,
                'type': 'Unknown',
                'price': price,
                'rooms': self._strip_label(rooms, ' pièce'),
                'bedrooms': self._strip_label(bedrooms, ' chambre'),
                'area': infos.xpath('./li[contains(small, "m")]/text()').get(),
            }

        next_page = response.xpath(self.next_page_xpath).get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)