101 lines
3.2 KiB
Python
101 lines
3.2 KiB
Python
import scrapy
|
|
|
|
|
|
def get_attributes(attributes):
    """Extract the property type, room count and surface from ad attributes.

    Args:
        attributes: Iterable of attribute mappings. Each one must have a
            ``'key'`` entry; ``'real_estate_type'`` entries are read through
            ``'value_label'``, ``'rooms'`` and ``'square'`` entries through
            ``'value'``. ``None`` is treated as an empty iterable.

    Returns:
        A ``(type, rooms, area)`` tuple. Any element whose attribute is not
        present is ``None``. If a key appears several times, the last
        occurrence wins.

    Raises:
        KeyError: If an attribute lacks ``'key'`` or the value entry that is
            read for its key.
    """
    # Start from None so that missing attributes need no special handling.
    estate_type = rooms = area = None

    for attribute in attributes or ():
        key = attribute['key']
        if key == 'real_estate_type':
            estate_type = attribute['value_label']
        elif key == 'rooms':
            rooms = attribute['value']
        elif key == 'square':
            area = attribute['value']

    return estate_type, rooms, area
|
|
|
|
|
|
class LeboncoinSpider(scrapy.Spider):
    """Spider querying the Leboncoin ad-finder search API for property ads.

    One search is started per entry of ``areas``. Each results page is turned
    into flat items, and the following page is requested until the offset
    reaches the ``total`` reported by the API.
    """

    name = "leboncoin"

    apiurl = 'https://api.leboncoin.fr/api/adfinder/v1/search'

    # Search zones; each entry is injected as ``filters.location.area``.
    areas = (
        {"lat": 45.521971, "lng": 4.869926, "radius": 1000},  # Vienne
        {"lat": 45.437621, "lng": 4.388003, "radius": 1000}  # Saint-Etienne
    )

    # Base search filters shared by every area (never mutated at runtime).
    filters = {
        "category": {"id": "9"},  # 9 : buy
        "enums": {
            "real_estate_type": ["1", "2", "5"],  # 1: houses, 2: apartments, 5: others
            "ad_type": ["offer"],
            "immo_sell_type": ["old"]
        },
        "ranges": {"price": {"max": 300000}},
        "location": {
            "city_zipcodes": [],
            "departments": [],
            "disable_region": False,
            "locations": [],
            "regions": []
        },
        "keywords": {}
    }

    headers = {
        "User-Agent": "LBC;Android;6.0;Android SDK built for x86;phone;616a1ca77ca70180;wwan;4.30.4.0;70400;3",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.8,fr;q=0.6",
        "Referer": "https://www.leboncoin.fr/recherche",
        "Origin": "https://www.leboncoin.fr"
    }

    def _search_request(self, data):
        """Build the JSON request for one results page described by *data*."""
        return scrapy.http.JsonRequest(
            url=self.apiurl,
            headers=self.headers,
            data=data,
            callback=self.parse,
            # Give parse() its own payload so that pagination of one area
            # never depends on state shared with the other areas.
            cb_kwargs={"data": data},
        )

    def start_requests(self):
        """Yield the first-page search request for every configured area."""
        for area in self.areas:
            # Build new dicts rather than writing into the class-level
            # ``filters`` (a shallow copy would share the nested 'location').
            location = dict(self.filters['location'], area=area)
            filters = dict(self.filters, location=location)

            data = {
                "pivot": "0,0,0",  # page cursor
                "limit": 100,  # number of results per page (100 is server-side max)
                "limit_alu": 1,  # 0 to return only statistics, 1 to also return listings
                "offset": 0,
                "filters": filters,
                "sort_by": "time",
                "sort_order": "desc"
            }
            # Kept for backward compatibility with code reading ``self.data``;
            # parse() itself uses the per-request payload.
            self.data = data
            yield self._search_request(data)

    def parse(self, response, data=None):
        """Yield one item per ad of the page, then the next-page request.

        Args:
            response: Response whose JSON body holds ``"ads"`` and ``"total"``.
            data: Search payload that produced *response*. When omitted, the
                payload last stored in ``self.data`` is used.

        Yields:
            Item dicts with keys ``url``, ``type``, ``price``, ``rooms``,
            ``bedrooms`` and ``area``, followed by a request for the next
            page while more results remain.
        """
        if data is None:
            data = self.data

        payload = response.json()
        for ad in payload["ads"]:
            estate_type, rooms, area = get_attributes(ad['attributes'])
            # An ad without a price yields None instead of aborting the page
            # (which would also drop the next-page request below).
            price = ad.get('price')

            yield {
                'url': ad['url'],
                'type': estate_type,
                'price': int(price[0]) if price else None,
                'rooms': rooms,
                'bedrooms': 'Unknown',
                'area': area
            }

        next_offset = data["offset"] + data["limit"]
        if next_offset < payload["total"]:  # If next page
            # New dict: the payload of the request already sent stays intact.
            yield self._search_request(dict(data, offset=next_offset))
|