Leboncoin API search working, still need to parse the results

Jordan ERNST 2020-07-03 12:59:24 +02:00
parent 42ad03f6e4
commit 2e52581737
2 changed files with 59 additions and 36 deletions


@@ -1,36 +0,0 @@
import scrapy

filters = {"category": {"id": "9"},
           "enums": {"real_estate_type": ["1", "2", "5"], "ad_type": ["offer"]},
           "ranges": {"rooms": {}, "square": {}, "price": {"min": 0, "max": 300000}},
           "location": {"area": {"lat": 45.521971, "lng": 4.869926, "radius": 1000}, "city_zipcodes": [], "departments": [], "disable_region": False, "locations": [], "regions": []},
           "keywords": {"type": "all"},
           "owner": {}}

data = {"pivot": "0,0,0", "limit": 100, "limit_alu": 1,
        "filters": filters,
        "sort_by": "time", "sort_order": "desc"}

headers = {"User-Agent": "LBC;Android;6.0;Android SDK built for x86;phone;616a1ca77ca70180;wwan;4.30.4.0;70400;3",
           "api_key": "ba0c2dad52b3ec", "Content-Type": "application/json; charset=UTF-8", "Accept-Encoding": "gzip, deflate"}


class LeboncoinSpider(scrapy.Spider):
    name = "leboncoin"

    def start_requests(self):
        urls = [
            'https://api.leboncoin.fr/api/adfinder/v1/search',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)


@@ -0,0 +1,59 @@
import scrapy


class LeboncoinSpider(scrapy.Spider):
    name = "leboncoin"

    apiurl = 'https://api.leboncoin.fr/api/adfinder/v1/search'

    filters = {
        "category": {"id": "9"},  # 9: buy
        "enums": {
            "real_estate_type": ["1", "2", "5"],  # 1: houses, 2: apartments, 5: others
            "ad_type": ["offer"],
            "immo_sell_type": ["old"]
        },
        "ranges": {"price": {"max": 300000}},
        "location": {
            "area": {"lat": 45.521971, "lng": 4.869926, "radius": 1000},
            "city_zipcodes": [],
            "departments": [],
            "disable_region": False,
            "locations": [],
            "regions": []
        },
        "keywords": {}
    }

    data = {
        "pivot": "0,0,0",  # page cursor
        "limit": 100,  # number of results per page (100 is the server-side max)
        "limit_alu": 1,  # 0 to return only statistics, 1 to also return listings
        "offset": 0,
        "filters": filters,
        "sort_by": "time",
        "sort_order": "desc"
    }

    headers = {
        "User-Agent": "LBC;Android;6.0;Android SDK built for x86;phone;616a1ca77ca70180;wwan;4.30.4.0;70400;3",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.8,fr;q=0.6",
        "Referer": "https://www.leboncoin.fr/recherche",
        "Origin": "https://www.leboncoin.fr"
    }

    def start_requests(self):
        yield scrapy.http.JsonRequest(url=self.apiurl, headers=self.headers,
                                      data=self.data, callback=self.parse)

    def parse(self, response):
        payload = response.json()
        ads = payload["ads"]
        for ad in ads:
            pass  # TODO: extract the fields of interest from each ad

        # Paginate: each response schedules at most one follow-up request,
        # so only one request is in flight and mutating the shared offset is safe.
        total_ads_nb = payload["total"]
        next_offset = self.data["offset"] + self.data["limit"]
        if next_offset < total_ads_nb:  # there is a next page
            self.data["offset"] = next_offset
            yield scrapy.http.JsonRequest(self.apiurl, headers=self.headers,
                                          data=self.data, callback=self.parse)
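
The loop over ads above is still a stub, as the commit message says. A minimal sketch of what the per-ad extraction could grow into is shown below; the field names (list_id, subject, url, price, location) are assumptions about the adfinder response schema, not something this commit confirms.

# Hypothetical next step for the stubbed loop above -- not part of this commit.
# Flatten one ad from the adfinder JSON into a plain dict; every field name
# here is an assumption about the response schema.
def extract_ad(ad):
    location = ad.get("location", {})
    return {
        "list_id": ad.get("list_id"),             # assumed: numeric ad id
        "subject": ad.get("subject"),             # assumed: listing title
        "url": ad.get("url"),                     # assumed: public listing URL
        "price": (ad.get("price") or [None])[0],  # assumed: price wrapped in a list
        "city": location.get("city"),
        "zipcode": location.get("zipcode"),
    }


if __name__ == "__main__":
    # Toy payload, invented only to exercise the helper.
    sample = {"list_id": 1, "subject": "Maison 4 pieces",
              "url": "https://www.leboncoin.fr/ventes_immobilieres/1.htm",
              "price": [250000],
              "location": {"city": "Vienne", "zipcode": "38200"}}
    print(extract_ad(sample))

Inside the spider, the stubbed loop would then become `yield extract_ad(ad)`, letting Scrapy's item pipeline handle the resulting dicts.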