import scrapy


def get_attributes(attributes):
    """Extract the property type, room count and surface from ad attributes.

    Args:
        attributes: iterable of dicts, each holding a ``key`` entry plus a
            ``value`` (for ``rooms`` / ``square``) or a ``value_label``
            (for ``real_estate_type``) entry.

    Returns:
        A ``(type, rooms, area)`` tuple. Each element is ``None`` when the
        matching attribute is not present in *attributes*.
    """
    property_type = None
    rooms = None
    area = None
    for attribute in attributes:
        key = attribute['key']
        if key == 'real_estate_type':
            property_type = attribute['value_label']
        elif key == 'rooms':
            rooms = attribute['value']
        elif key == 'square':
            area = attribute['value']
    return property_type, rooms, area


class LeboncoinSpider(scrapy.Spider):
    """Spider that POSTs JSON searches to the Leboncoin API and yields ads.

    One search is issued per entry of ``areas``; each search is paginated
    independently using the ``offset`` / ``limit`` fields of its own payload.
    """

    name = "leboncoin"
    apiurl = 'https://api.leboncoin.fr/api/adfinder/v1/search'

    # Circular search zones sent as the "area" entry of the location filter.
    areas = (
        {"lat": 45.521971, "lng": 4.869926, "radius": 1000},  # Vienne
        {"lat": 45.437621, "lng": 4.388003, "radius": 1000},  # Saint-Etienne
    )

    # Base search filters; treated as read-only (a per-area copy is built in
    # start_requests so this shared class attribute is never mutated).
    filters = {
        "category": {"id": "9"},  # 9: buy
        "enums": {
            "real_estate_type": ["1", "2", "5"],  # 1: houses, 2: apartments, 5: others
            "ad_type": ["offer"],
            "immo_sell_type": ["old"],
        },
        "ranges": {"price": {"max": 300000}},
        "location": {
            "city_zipcodes": [],
            "departments": [],
            "disable_region": False,
            "locations": [],
            "regions": [],
        },
        "keywords": {},
    }

    headers = {
        "User-Agent": "LBC;Android;6.0;Android SDK built for x86;phone;616a1ca77ca70180;wwan;4.30.4.0;70400;3",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.8,fr;q=0.6",
        "Referer": "https://www.leboncoin.fr/recherche",
        "Origin": "https://www.leboncoin.fr",
    }

    def start_requests(self):
        """Yield the first-page search request for every configured area."""
        for area in self.areas:
            # Build a fresh "location" dict instead of mutating the nested
            # dict shared through the class-level ``filters``.
            filters = {
                **self.filters,
                "location": {**self.filters["location"], "area": area},
            }
            data = {
                "pivot": "0,0,0",  # page cursor
                "limit": 100,  # number of results per page (100 is server-side max)
                "limit_alu": 1,  # 0 to return only statistics, 1 to also return listings
                "offset": 0,
                "filters": filters,
                "sort_by": "time",
                "sort_order": "desc",
            }
            # The payload travels with the request (cb_kwargs) so that each
            # area keeps its own filters and offset while paginating.
            yield scrapy.http.JsonRequest(
                url=self.apiurl,
                headers=self.headers,
                data=data,
                callback=self.parse,
                cb_kwargs={"data": data},
            )

    def parse(self, response, data=None):
        """Yield one item per ad, then the request for the next page if any.

        Args:
            response: JSON response of a search request.
            data: the payload that produced *response*; used to compute the
                next page. When ``None`` no follow-up request is issued.
        """
        payload = response.json()

        # Tolerate a response without an "ads" entry rather than raising.
        for ad in payload.get("ads") or []:
            property_type, rooms, area = get_attributes(ad.get('attributes') or [])
            # An ad without a price is emitted with price=None instead of
            # aborting the whole page.
            prices = ad.get('price') or []
            yield {
                'url': ad['url'],
                'type': property_type,
                'price': int(prices[0]) if prices else None,
                'rooms': rooms,
                'bedrooms': 'Unknown',
                'area': area,
            }

        if data is None:
            return

        total_ads_nb = payload.get("total", 0)
        next_offset = data["offset"] + data["limit"]
        if next_offset < total_ads_nb:  # another page is available
            # New dict: the payload of the request in flight is left intact.
            next_data = {**data, "offset": next_offset}
            yield scrapy.http.JsonRequest(
                self.apiurl,
                headers=self.headers,
                data=next_data,
                callback=self.parse,
                cb_kwargs={"data": next_data},
            )