Scraping a website with a page limitation - Python

I am trying to scrape the listings at https://www.olx.com.eg/en/properties/. The site shows 200,000+ ads and I'd like to scrape all of them, but the pagination doesn't go beyond 49 pages. I have figured out the API endpoint the data comes from.
API endpoint:
'https://search.olx.com.eg/_msearch?filter_path=took%2C*.took%2C*.suggest.*.options.text%2C*.suggest.*.options._source.*%2C*.hits.total.*%2C*.hits.hits._source.*%2C*.hits.hits.highlight.*%2C*.error%2C*.aggregations.*.buckets.key%2C*.aggregations.*.buckets.doc_count%2C*.aggregations.*.buckets.complex_value.hits.hits._source%2C*.aggregations.*.filtered_agg.facet.buckets.key%2C*.aggregations.*.filtered_agg.facet.buckets.doc_count%2C*.aggregations.*.filtered_agg.facet.buckets.complex_value.hits.hits._source'
POST data:
data = '{"index":"olx-eg-production-ads-ar"}\n{"from":0,"size":0,"track_total_hits":false,"query":{"bool":{"must":[{"term":{"category.slug":"properties"}}]}},"aggs":{"category.lvl1.externalID":{"global":{},"aggs":{"filtered_agg":{"filter":{"bool":{"must":[{"term":{"category.lvl0.externalID":"138"}}]}},"aggs":{"facet":{"terms":{"field":"category.lvl1.externalID","size":20}}}}}},"location.lvl1":{"global":{},"aggs":{"filtered_agg":{"filter":{"bool":{"must":[{"term":{"category.slug":"properties"}},{"term":{"location.lvl0.externalID":"0-1"}}]}},"aggs":{"facet":{"terms":{"field":"location.lvl1.externalID","size":20},"aggs":{"complex_value":{"top_hits":{"size":1,"_source":{"include":["location.lvl1"]}}}}}}}}},"product":{"global":{},"aggs":{"filtered_agg":{"filter":{"bool":{"must":[{"term":{"category.slug":"properties"}},{"term":{"product":"featured"}},{"term":{"location.externalID":"0-1"}}]}},"aggs":{"facet":{"terms":{"field":"product","size":20},"aggs":{"complex_value":{"top_hits":{"size":1,"_source":{"include":["product"]}}}}}}}}},"totalProductCount":{"global":{},"aggs":{"filtered_agg":{"filter":{"bool":{"must":[{"term":{"category.slug":"properties"}},{"term":{"product":"featured"}}]}},"aggs":{"facet":{"terms":{"field":"product","size":20},"aggs":{"complex_value":{"top_hits":{"size":1,"_source":{"include":["totalProductCount"]}}}}}}}}}}}\n{"index":"olx-eg-production-ads-ar"}\n{"from":0,"size":45,"track_total_hits":200000,"query":{"function_score":{"random_score":{"seed":97},"query":{"bool":{"must":[{"term":{"category.slug":"properties"}},{"term":{"product":"featured"}}]}}}},"sort":["_score"]}\n{"index":"olx-eg-production-ads-ar"}\n{"from":10045,"size":45,"track_total_hits":200000,"query":{"bool":{"must":[{"term":{"category.slug":"properties"}}]}},"sort":[{"timestamp":{"order":"desc"}},{"id":{"order":"desc"}}]}\n'
The problem is that even this Elasticsearch endpoint has a limit of 10,000 results; when I try to increase the from value in the POST data, it throws:
{"message":"[query_phase_execution_exception] Result window is too large, from + size must be less than or equal to: [10000] but was [10045]. See the scroll api for a more efficient way to request large data sets. This limit can be set by changing the [index.max_result_window] index level setting.","error":{"status":500}
I'd like to get all 200,000 listings. Is there any workaround?
Here is my code:
import scrapy
from scrapy.crawler import CrawlerProcess
import requests


class OlxScraper(scrapy.Spider):
    name = "olx-scraper"
    custom_settings = {
        "FEED_FORMAT": "csv",
        "FEED_URI": "olx_eg_property_listing.csv",
        "LOG_FILE": "olx_eg.log",
    }
    listing_endpoint = "https://search.olx.com.eg/_msearch?filter_path=took%2C*.took%2C*.suggest.*.options.text%2C*.suggest.*.options._source.*%2C*.hits.total.*%2C*.hits.hits._source.*%2C*.hits.hits.highlight.*%2C*.error%2C*.aggregations.*.buckets.key%2C*.aggregations.*.buckets.doc_count%2C*.aggregations.*.buckets.complex_value.hits.hits._source%2C*.aggregations.*.filtered_agg.facet.buckets.key%2C*.aggregations.*.filtered_agg.facet.buckets.doc_count%2C*.aggregations.*.filtered_agg.facet.buckets.complex_value.hits.hits._source"
    headers = {
        "authority": "search.olx.com.eg",
        "accept": "*/*",
        "accept-language": "en,ru;q=0.9",
        "authorization": "Basic b2x4LWVnLXByb2R1Y3Rpb24tc2VhcmNoOn1nNDM2Q0R5QDJmWXs2alpHVGhGX0dEZjxJVSZKbnhL",
        "content-type": "application/x-ndjson",
        "origin": "https://www.olx.com.eg",
        "referer": "https://www.olx.com.eg/",
        "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="100", "Yandex";v="22"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-site",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.143 YaBrowser/22.5.0.1879 (beta) Yowser/2.5 Safari/537.36",
    }
    data = '{{"index":"olx-eg-production-ads-ar"}}\n{{"from":0,"size":0,"track_total_hits":false,"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}}]}}}},"aggs":{{"category.lvl1.externalID":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.lvl0.externalID":"138"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"category.lvl1.externalID","size":20}}}}}}}}}}}},"location.lvl1":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"location.lvl0.externalID":"0-1"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"location.lvl1.externalID","size":20}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["location.lvl1"]}}}}}}}}}}}}}}}}}},"product":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"product":"featured"}}}},{{"term":{{"location.externalID":"0-1"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"product","size":20}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["product"]}}}}}}}}}}}}}}}}}},"totalProductCount":{{"global":{{}},"aggs":{{"filtered_agg":{{"filter":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"product":"featured"}}}}]}}}},"aggs":{{"facet":{{"terms":{{"field":"product","size":20}},"aggs":{{"complex_value":{{"top_hits":{{"size":1,"_source":{{"include":["totalProductCount"]}}}}}}}}}}}}}}}}}}}}}}\n{{"index":"olx-eg-production-ads-ar"}}\n{{"from":45,"size":0,"track_total_hits":200000,"query":{{"function_score":{{"random_score":{{"seed":97}},"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}},{{"term":{{"product":"featured"}}}}]}}}}}}}},"sort":["_score"]}}\n{{"index":"olx-eg-production-ads-ar"}}\n{{"from":{},"size":45,"track_total_hits":200000,"query":{{"bool":{{"must":[{{"term":{{"category.slug":"properties"}}}}]}}}},"sort":[{{"timestamp":{{"order":"desc"}}}},{{"id":{{"order":"desc"}}}}]}}\n'
    def start_requests(self):
        for i in range(0, 100045):
            pg = i + 45
            yield scrapy.Request(
                url=self.listing_endpoint,
                method="POST",
                headers=self.headers,
                body=self.data.format(pg),
                callback=self.parse_links,
            )
    def parse_links(self, response):
        try:
            listing_data = response.json()["responses"][2]["hits"]["hits"]
        except:
            listing_data = response.json()["responses"][1]["hits"]["hits"]
        for listing in listing_data:
            listing_id = listing["_source"]["externalID"]
            listing_url = "https://www.olx.com.eg/en/ad/" + listing_id
            yield scrapy.Request(
                url=listing_url,
                headers=self.headers,
                callback=self.parse_details,
                meta={"listing_url": listing_url},
            )
    def parse_details(self, response):
        item = {}
        reference_id = response.css("div._171225da::text").get().replace("Ad id ", "")
        sub_detail_list = response.css("div._676a547f ::text").extract()
        item["URL"] = response.meta.get("listing_url")
        try:
            item["Breadcrumb"] = (
                response.css("li._8c543153 ::text")[4].get()
                + "/"
                + response.css("li._8c543153 ::text")[3].get()
                + "/"
                + response.css("li._8c543153 ::text")[2].get()
                + "/"
                + response.css("li._8c543153 ::text")[1].get()
                + "/"
                + response.css("li._8c543153 ::text").get()
            )
        except:
            item["Breadcrumb"] = (
                response.css("li._8c543153 ::text")[3].get()
                + "/"
                + response.css("li._8c543153 ::text")[2].get()
                + "/"
                + response.css("li._8c543153 ::text")[1].get()
                + "/"
                + response.css("li._8c543153 ::text").get()
            )
        item["Price"] = response.css("span._56dab877 ::text").get()
        item["Title"] = response.css("h1.a38b8112::text").get()
        item["Type"] = response.css("div.b44ca0b3 ::text")[1].get()
        item["Bedrooms"] = response.css("span.c47715cd::text").get()
        try:
            item["Bathrooms"] = response.css("span.c47715cd::text")[1].get()
        except:
            item["Bathrooms"] = ""
        try:
            item["Area"] = response.css("span.c47715cd::text")[2].get()
        except:
            for sub in sub_detail_list:
                if "Area (m²)" in sub_detail_list:
                    item["Area"] = sub_detail_list[
                        sub_detail_list.index("Area (m²)") + 1
                    ]
                else:
                    item["Area"] = ""
        item["Location"] = response.css("span._8918c0a8::text").get()
        try:
            if response.css("div.b44ca0b3 ::text")[18].get() == "Compound":
                item["Compound"] = response.css("div.b44ca0b3 ::text")[19].get()
            elif response.css("div.b44ca0b3 ::text")[16].get() == "Compound":
                item["Compound"] = response.css("div.b44ca0b3 ::text")[17].get()
        except:
            item["Compound"] = ""
        item["seller"] = response.css("span._261203a9._2e82a662::text").getall()[1]
        member_since = response.css("span._34a7409b ::text")[1].get()
        if member_since == "Cars for Sale":
            item["Seller_member_since"] = response.css("span._34a7409b ::text").get()
        if "Commercial ID: " in member_since:
            item["Seller_member_since"] = response.css("span._34a7409b ::text")[2].get()
        else:
            item["Seller_member_since"] = member_since
        res = requests.get(
            f"https://www.olx.com.eg/api/listing/{reference_id}/contactInfo/"
        )
        item["Seller_phone_number"] = res.json()["mobile"]
        item["Description"] = (
            response.css("div._0f86855a ::text").get().replace("\n", "")
        )
        item["Amenities"] = ",".join(response.css("div._27f9c8ac ::text").extract())
        item["Reference"] = reference_id
        item["Listed_date"] = response.css("span._8918c0a8 ::text")[1].get()
        item["Level"] = ""
        item["Payment_option"] = ""
        item["Delivery_term"] = ""
        item["Furnished"] = ""
        item["Delivery_date"] = ""
        item["Down_payment"] = ""
        for sub_detail in sub_detail_list:
            if "Level" in sub_detail_list:
                item["Level"] = sub_detail_list[sub_detail_list.index("Level") + 1]
            if "Payment Option" in sub_detail_list:
                item["Payment_option"] = sub_detail_list[
                    sub_detail_list.index("Payment Option") + 1
                ]
            if "Delivery Term" in sub_detail_list:
                item["Delivery_term"] = sub_detail_list[
                    sub_detail_list.index("Delivery Term") + 1
                ]
            if "Furnished" in sub_detail_list:
                item["Furnished"] = sub_detail_list[
                    sub_detail_list.index("Furnished") + 1
                ]
            if "Delivery Date" in sub_detail_list:
                item["Delivery_date"] = sub_detail_list[
                    sub_detail_list.index("Delivery Date") + 1
                ]
            if "Down Payment" in sub_detail_list:
                item["Down_payment"] = sub_detail_list[
                    sub_detail_list.index("Down Payment") + 1
                ]
        item["Image_url"] = response.css("picture._219b7e0a ::attr(srcset)")[1].get()
        yield item
# main driver
if __name__ == "__main__":
    # run scraper
    process = CrawlerProcess()
    process.crawl(OlxScraper)
    process.start()
How can I overcome this 10,000 limit? Thanks in advance.
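One commonly used workaround for Elasticsearch's result-window cap (the error itself points in this direction) is deep pagination with search_after: keep from at 0 and pass the sort values of the last hit of the previous page as the cursor for the next request. Whether OLX's search proxy accepts a search_after key is an assumption; the index name, authorization header and sort order below are taken from the question. A minimal sketch with requests (the filter_path query string is omitted so each hit keeps its sort values):

import json
import requests

SEARCH_URL = "https://search.olx.com.eg/_msearch"  # same endpoint as above, without the filter_path query string
HEADERS = {
    "authorization": "Basic ...",  # same Authorization header used by the spider
    "content-type": "application/x-ndjson",
}

def fetch_page(search_after=None):
    # one page of 45 ads, sorted by timestamp/id, optionally continuing after a cursor
    query = {
        "from": 0,
        "size": 45,
        "query": {"bool": {"must": [{"term": {"category.slug": "properties"}}]}},
        "sort": [{"timestamp": {"order": "desc"}}, {"id": {"order": "desc"}}],
    }
    if search_after is not None:
        query["search_after"] = search_after  # cursor = sort values of the previous page's last hit
    body = json.dumps({"index": "olx-eg-production-ads-ar"}) + "\n" + json.dumps(query) + "\n"
    resp = requests.post(SEARCH_URL, headers=HEADERS, data=body)
    return resp.json()["responses"][0]["hits"]["hits"]

cursor = None
while True:
    hits = fetch_page(cursor)
    if not hits:
        break
    for hit in hits:
        print(hit["_source"]["externalID"])
    cursor = hits[-1]["sort"]  # feed the last hit's sort values into the next request

If the proxy rejects search_after, the usual fallback is to slice the query so that every slice stays under 10,000 hits, for example by adding a timestamp range filter per slice and paging each slice with from/size exactly as the spider already does.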

Related

Scraped youtube comments amount and real amount are different

I'm new to Python and I'm trying to code a comment scraper for YouTube that stores the most important information in a JSON file. But my count of comments and replies is not the same as on YouTube, and I don't know where my error is. I noticed that it doesn't write any data to the files if there are fewer than 20 comments, but I don't know what I have to change...
Example:
https://youtu.be/Re1m9O7q-9U here I get 102, but it should be 107
https://youtu.be/Q9Y5m1fQ7Fk here I get 423, but it should be 486
https://youtu.be/cMhE5BfmFkM here I get 1315, but it should be 2052
Here is the code:
import json

import requests
import tqdm


class YT_Comments:
    def __init__(self, api_key):
        self.api_key = api_key
        self.comment_int = 0

    def get_video_comments(self, video_id, limit):
        url = f"https://youtube.googleapis.com/youtube/v3/commentThreads?part=replies%2C%20snippet&order=relevance&videoId={video_id}&key={self.api_key}"
        vid_comments = []
        pc, npt = self._get_comments_per_page(url)
        if limit is not None and isinstance(limit, int):
            url += f"&maxResults={str(limit)}"
        while (npt is not None):
            nexturl = url + "&pageToken=" + npt
            pc, npt = self._get_comments_per_page(nexturl)
            vid_comments.append(pc)
        print(self.comment_int)
        print(len(vid_comments))
        return vid_comments

    def _get_comments_per_page(self, url):
        json_url = requests.get(url)
        data = json.loads(json_url.text)
        page_comments = []
        if "items" not in data:
            return page_comments, None
        item_data = data["items"]
        nextPageToken = data.get("nextPageToken", None)
        for item in tqdm.tqdm(item_data):
            try:
                kind = item["kind"]
                if kind == "youtube#comment" or "youtube#commentThread":
                    comment_text = item["snippet"]["topLevelComment"]["snippet"]["textOriginal"]
                    comment_author = item["snippet"]["topLevelComment"]["snippet"]["authorDisplayName"]
                    author_id = item["snippet"]["topLevelComment"]["snippet"]["authorChannelId"]["value"]
                    comment_like_count = item["snippet"]["topLevelComment"]["snippet"]["likeCount"]
                    comment_date = item["snippet"]["topLevelComment"]["snippet"]["publishedAt"]
                    comment = {"comment_text": comment_text,
                               "comment_author": comment_author,
                               "comment_author_id": author_id,
                               "comment_like_count": comment_like_count,
                               "comment_date": comment_date}
                    replies_l = []
                    self.comment_int += 1
                    try:
                        replies = item["replies"]["comments"]
                        for reply in replies:
                            reply_txt = reply["snippet"]["textOriginal"]
                            reply_author = reply["snippet"]["authorDisplayName"]
                            reply_author_id = reply["snippet"]["authorChannelId"]["value"]
                            reply_like_count = reply["snippet"]["likeCount"]
                            reply_date = reply["snippet"]["publishedAt"]
                            reply_dict = {"text": reply_txt,
                                          "author": reply_author,
                                          "author_id": reply_author_id,
                                          "likes": reply_like_count,
                                          "date": reply_date}
                            replies_l.append(reply_dict)
                            self.comment_int += 1
                    except KeyError:
                        replies_l.append(None)
                    comment_dict = {
                        "comment": comment,
                        "replies": replies_l,
                    }
                    page_comments.append(comment_dict)
            except KeyError:
                print("No Comments")
        return page_comments, nextPageToken
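Part of the difference is visible in get_video_comments itself: the first page returned by _get_comments_per_page is fetched but never appended (only pages requested inside the while loop are), and maxResults is added to the URL only after that first call, so the first request returns the API default of 20 threads. Also, the replies list embedded in a commentThread is not guaranteed to be complete; fetching every reply requires the comments endpoint with parentId. A minimal sketch of a pagination loop that keeps the first page, reusing _get_comments_per_page from above:

def get_video_comments(self, video_id, limit=100):
    # maxResults is capped at 100 threads per page by the API
    url = (
        "https://youtube.googleapis.com/youtube/v3/commentThreads"
        f"?part=replies,snippet&order=relevance&videoId={video_id}"
        f"&maxResults={limit}&key={self.api_key}"
    )
    vid_comments = []
    page_url = url
    while True:
        pc, npt = self._get_comments_per_page(page_url)
        vid_comments.append(pc)  # keep the first page as well
        if npt is None:
            break
        page_url = url + "&pageToken=" + npt
    return vid_comments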

Can I use a while loop with 'i' as a variable to be used in tr[i] in an XPath?

import scrapy
import logging


class AssetSpider(scrapy.Spider):
    name = 'asset'
    start_urls = ['http://mnregaweb4.nic.in/netnrega/asset_report_dtl.aspx?lflag=eng&state_name=WEST%20BENGAL&state_code=32&district_name=NADIA&district_code=3201&block_name=KRISHNAGAR-I&block_code=&panchayat_name=DOGACHI&panchayat_code=3201009009&fin_year=2020-2021&source=national&Digest=8+kWKUdwzDQA1IJ5qhD8Fw']

    def parse(self, response):
        i = 4
        while i < 2236:
            assetid = response.xpath("//table[2]//tr['i']/td[2]/text()")
            assetcategory = response.xpath("//table[2]//tr['i']/td[3]/text()")
            schemecode = response.xpath("//table[2]//tr['i']/td[5]/text()")
            link = response.xpath("//table[2]//tr['i']/td[6]/a/@href")
            schemename = response.xpath("//table[2]//tr['i']/td[7]/text()")
            yield {
                'assetid': assetid,
                'assetcategory': assetcategory,
                'schemecode': schemecode,
                'link': link,
                'schemename': schemename
            }
            i += 1
I want to use the 'i' variable to loop over tr[position] in the XPath from 4 to 2235. I just don't know if it is possible, and if it is, what is the right way to do it? Mine does not work.
Sure, it is possible and widely used. You can format the string with the variable; there are several syntaxes for that. For example, you can do it like this:
i = 4
while i < 2236:
    assetid_path = "//table[2]//tr[{0}]/td[2]/text()".format(i)
    assetcategory_path = "//table[2]//tr[{0}]/td[3]/text()".format(i)
    schemecode_path = "//table[2]//tr[{0}]/td[5]/text()".format(i)
    link_path = "//table[2]//tr[{0}]/td[6]/a/@href".format(i)
    schemename_path = "//table[2]//tr[{0}]/td[7]/text()".format(i)
    assetid = response.xpath(assetid_path)
    assetcategory = response.xpath(assetcategory_path)
    schemecode = response.xpath(schemecode_path)
    link = response.xpath(link_path)
    schemename = response.xpath(schemename_path)
    yield {
        'assetid': assetid,
        'assetcategory': assetcategory,
        'schemecode': schemecode,
        'link': link,
        'schemename': schemename
    }
    i += 1
The above can be shortened like this:
i = 4
while i < 2236:
    root_path = "//table[2]//tr[{0}]".format(i)
    assetid_path = root_path + "/td[2]/text()"
    assetcategory_path = root_path + "/td[3]/text()"
    schemecode_path = root_path + "/td[5]/text()"
    link_path = root_path + "/td[6]/a/@href"
    schemename_path = root_path + "/td[7]/text()"
    assetid = response.xpath(assetid_path)
    assetcategory = response.xpath(assetcategory_path)
    schemecode = response.xpath(schemecode_path)
    link = response.xpath(link_path)
    schemename = response.xpath(schemename_path)
    yield {
        'assetid': assetid,
        'assetcategory': assetcategory,
        'schemecode': schemecode,
        'link': link,
        'schemename': schemename
    }
    i += 1
But the better way is to bind the variable in the XPath expression, as follows:
i = 4
while i < 2236:
    assetid = response.xpath("//table[2]//tr[$i]/td[2]/text()", i=i)
    assetcategory = response.xpath("//table[2]//tr[$i]/td[3]/text()", i=i)
    schemecode = response.xpath("//table[2]//tr[$i]/td[5]/text()", i=i)
    link = response.xpath("//table[2]//tr[$i]/td[6]/a/@href", i=i)
    schemename = response.xpath("//table[2]//tr[$i]/td[7]/text()", i=i)
    yield {
        'assetid': assetid,
        'assetcategory': assetcategory,
        'schemecode': schemecode,
        'link': link,
        'schemename': schemename
    }
    i += 1
You send a string to xpath(), so I would suggest using formatting, e.g.:
response.xpath(f"//table[2]//tr[{i}]/td[2]/text()")

Scraping view function remembers its previous iterations

I have the following view function used to scrape data:
def results(request):
    if request.method == 'POST':
        form = RoomForm(request.POST)
        if form.is_valid():
            form_city = form.cleaned_data['city'].title()
            form_country = form.cleaned_data['country'].title()
            form_arrival_date = form.cleaned_data['arrival_date']
            form_departure_date = form.cleaned_data['departure_date']
            form_pages_to_scrape = form.cleaned_data['pages_to_scrape']
            # launch scraper
            scraper = AIRBNB_scraper(city=form_city, country=form_country, arrival_date=str(form_arrival_date), departure_date=str(form_departure_date))
            scraped_dataframe = scraper.scrape_multiple_pages(last_page_selector_number=form_pages_to_scrape)
            scraped_dataframe_sorted = scraped_dataframe.sort_values('prices')
            print(scraped_dataframe_sorted)
            # convert scraped dataframe into lists
            prices = scraped_dataframe_sorted['prices'].tolist()
            listings_links = scraped_dataframe_sorted['listings_links'].tolist()
            listings_names = scraped_dataframe_sorted['listings_names'].tolist()
            photo_links = scraped_dataframe_sorted['photo_links'].tolist()
            dictionary = zip(prices, listings_links, listings_names, photo_links)
            context = {'dictionary': dictionary}
            return render(request, 'javascript/results.html', context)
On form submit, a POST request is sent to this function using AJAX:
var frm = $('#login-form');
frm.submit(function () {
    $.ajax({
        type: "POST",
        url: "/results",
        data: frm.serialize(),
        success: function (data) {
            $("#table").html(data);
            $('#go_back').remove();
        },
        error: function(data) {
            $("#table").html("Something went wrong!");
        }
    });
    return false;
});
After that the scraped data is displayed as HTML table on the same page the form is on.
The problem is that the number of scraped items doubles every time the form is submitted. So, for example, if the number of scraped items on the first button click is sixteen, the output will be 16, but on the second run it will be 32, then 64, and so on.
It is like the app remembers previous form submits, but I don't see any reason why. I tried clearing, at the end of this function, the pandas dataframe used to store the scraped data and also the dictionary passed as context, but to no avail.
The form is:
class RoomForm(forms.Form):
    city = forms.CharField(max_length=100)
    country = forms.CharField(max_length=100)
    arrival_date = forms.DateField(widget=forms.DateInput(attrs=
        {
            'class': 'datepicker'
        }), required=False)
    departure_date = forms.DateField(widget=forms.DateInput(attrs=
        {
            'class': 'datepicker'
        }), required=False)
    pages_to_scrape = forms.IntegerField(label='Pages to scrape (max. 17)', min_value=0, max_value=17, widget=forms.NumberInput(attrs={'style': 'width: 188px'}))
AIRBNB_scraper is:
import requests, bs4
import re
import pandas as pd

price_pattern = re.compile(r'\d*\s*?,?\s*?\d*\szł')
photo_link_pattern = re.compile(r'https.*\)')

prices = []
listings_links = []
photo_links = []
listings_names = []


class AIRBNB_scraper():
    def __init__(self, city, country, accomodation_type='homes', arrival_date='2018-03-25', departure_date='2018-04-10'):
        self.city = city
        self.country = country
        self.arrival_date = arrival_date
        self.departure_date = departure_date
        self.accomodation_type = accomodation_type

    def make_soup(self, page_number):
        url = 'https://www.airbnb.pl/s/' + self.city + '--' + self.country + '/' + self.accomodation_type + '?query=' + self.city + '%2C%20' + self.country + '&refinement_paths%5B%5D=%2F' + self.accomodation_type + '&checkin=' + self.arrival_date + '&checkout=' + self.departure_date + '&section_offset=' + str(page_number)
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        return soup

    def get_listings(self, page_number):
        soup = self.make_soup(page_number)
        listings = soup.select('._f21qs6')
        number_of_listings = len(listings)
        print('\n' + "Number of listings found: " + str(number_of_listings))
        while number_of_listings != 18:
            print('\n' + str(number_of_listings) + ' is not correct number of listings, it should be 18. Trying again now.')
            soup = self.make_soup(page_number)
            listings = soup.find_all('div', class_='_f21qs6')
            number_of_listings = len(listings)
        print('\n' + "All fine! The number of listings is: " + str(number_of_listings) + '. Starting scraping now')
        return listings

    def scrape_listings_per_page(self, page_number):
        listings_to_scrape = self.get_listings(page_number)
        for listing in listings_to_scrape:
            # get price
            price_container = listing.find_all('span', class_='_hylizj6')
            price_search = re.search(price_pattern, str(price_container))
            price = price_search.group()
            # get listing_link
            listing_link = 'https://www.airbnb.pl' + listing.find('a', class_='_15ns6vh')['href']
            # get photo_link
            photo_link_node = listing.find('div', class_="_1df8dftk")['style']
            photo_link_search = re.search(photo_link_pattern, str(photo_link_node))
            #~ if photo_link_search:
            #~     print('Is regex match')
            #~ else:
            #~     print('No regex match')
            photo_link_before_strip = photo_link_search.group()
            photo_link = photo_link_before_strip[:-1]  # remove ") at the end of link
            # get listing_name
            listing_name = listing.find('div', class_='_1rths372').text
            # append lists
            prices.append(price)
            listings_links.append(listing_link)
            photo_links.append(photo_link)
            listings_names.append(listing_name)

    def scrape_multiple_pages(self, last_page_selector_number):
        last_page_selector_number += 1
        for x in range(0, last_page_selector_number):  # 18
            self.scrape_listings_per_page(x)
            print('\n' + "INDEX OF PAGE BEING SCRAPED: " + str(x))
        scraped_data = pd.DataFrame({'prices': prices,
                                     'listings_links': listings_links,
                                     'photo_links': photo_links,
                                     'listings_names': listings_names})
        return scraped_data
You have module-level variables: prices, listings_links, etc. You append to these inside your AIRBNB_scraper instance but they are not part of that instance, and will persist between calls. You should make them instance attributes - define them as self.prices etc in the __init__ method.
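A minimal sketch of that change, keeping the rest of the class as posted (only the lists, and the places that touch them, move):

class AIRBNB_scraper():
    def __init__(self, city, country, accomodation_type='homes',
                 arrival_date='2018-03-25', departure_date='2018-04-10'):
        self.city = city
        self.country = country
        self.arrival_date = arrival_date
        self.departure_date = departure_date
        self.accomodation_type = accomodation_type
        # per-instance result lists: every new scraper starts empty
        self.prices = []
        self.listings_links = []
        self.photo_links = []
        self.listings_names = []

    # in scrape_listings_per_page, append to the instance attributes instead:
    #     self.prices.append(price)
    #     self.listings_links.append(listing_link)
    #     self.photo_links.append(photo_link)
    #     self.listings_names.append(listing_name)

    def scrape_multiple_pages(self, last_page_selector_number):
        for x in range(0, last_page_selector_number + 1):
            self.scrape_listings_per_page(x)
        return pd.DataFrame({'prices': self.prices,
                             'listings_links': self.listings_links,
                             'photo_links': self.photo_links,
                             'listings_names': self.listings_names})

Since the view creates a fresh AIRBNB_scraper on every POST, each submit then starts from empty lists instead of appending to the module-level ones that live for the lifetime of the process.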

pycurl error: ALPN, server did not agree to a protocol

I want to integrate the PayTR payment gateway with Odoo 10. I followed PayTR's developer guide and found a sample PHP script in it. After converting the PHP script to a Python script, I am getting the error:
ALPN, server did not agree to a protocol
Here are the logs:
logs
Below is the piece of code:
@http.route('/my/payment', type='http', auth="user", website=True)
def pay_now(self, **kw):
    partner = request.env.user.partner_id
    res = request.env['account.payment'].sudo().search([('partner_id', '=', partner.id)])
    print res
    acquirer = request.env['payment.acquirer'].sudo().search([('provider', '=', 'paytr')])
    merchant_id = acquirer.paytr_seller_account
    merchant_key = acquirer.paytr_merchant_key
    merchant_salt = acquirer.paytr_merchant_salt
    email = acquirer.paytr_email_account
    payment_amount = 300  # example
    merchant_oid = "50"  # example
    merchant_ok_url = "http://www.example.com/success.php"
    merchant_fail_url = "http://www.example.com/error.php"
    user_basket = "sample product"  # example
    user_basket = base64.b64encode(bytes(user_basket).encode('utf-8'))
    user_ip = get('https://api.ipify.org').text
    print user_ip
    timeout_limit = "30"
    debug_on = 1
    test_mode = 0
    no_installment = 0
    max_installment = 0
    currency = "TL"
    hash_str = str(merchant_id) + str(user_ip) + str(merchant_oid) + str(email) + str(payment_amount) + str(user_basket) + str(no_installment) + str(max_installment) + str(currency) + str(test_mode)
    print hash_str
    str_salt = hash_str + str(merchant_salt)
    print str_salt
    message = bytes(str_salt).encode('utf-8')
    secret = bytes(merchant_key).encode('utf-8')
    # paytr_token = base64.b64encode(hmac.new(str(merchant_key), str_salt, digestmod=hashlib.sha256).digest())
    paytr_token = base64.b64encode(hmac.new(secret, message, digestmod=hashlib.sha256).digest())
    print paytr_token
    values = {
        'merchant_id': merchant_id,
        'user_ip': user_ip,
        'merchant_oid': merchant_oid,
        'email': str(email),
        'payment_amount': int(payment_amount),
        'currency': str(currency),
        'user_basket': user_basket,
        'no_installment': no_installment,
        'max_installment': max_installment,
        'paytr_token': paytr_token,
        'user_name': str(partner.name),
        'user_address': str(partner.street),
        'user_phone': str(partner.phone),
        'merchant_ok_url': str(merchant_ok_url),
        'merchant_fail_url': str(merchant_fail_url),
        'test_mode': test_mode,
        'debug_on': int(debug_on),
        'timeout_limit': int(timeout_limit),
        'lang': "en",
    }
    print values
    post_data = urlencode(values)
    print "postfields : " + post_data
    buff = cStringIO.StringIO()
    buff = StringIO()
    c = pycurl.Curl()
    c.setopt(c.URL, 'https://www.paytr.com/odeme/api/get-token')
    c.setopt(c.POSTFIELDS, post_data)
    c.setopt(c.VERBOSE, True)
    c.setopt(c.WRITEDATA, buff)
    resp_data = c.perform()
    print('Status: %d' % c.getinfo(c.RESPONSE_CODE))
    print('Status: %f' % c.getinfo(c.TOTAL_TIME))
    c.close()
    data = buff.getvalue().decode('utf-8')
    buff.close()
    resp_data = json.load(resp_data)
    if resp_data['status'] == 'success':
        token = resp_data['token']
    else:
        print resp_data['reason']
    template = 'retailer_payment.pay_now'
    return request.render(template, token)
With this, the result contains nothing and it throws:
AttributeError: 'NoneType' object has no attribute 'read'
Can someone help me understand what's wrong here?
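The ALPN line is usually just an informational message from libcurl's verbose mode (HTTP/2 negotiation falling back to HTTP/1.1), not necessarily the failure itself. The AttributeError shown comes from the last few lines: pycurl's perform() returns None and writes the body into the WRITEDATA buffer, so json.load(resp_data) is handed None and tries to call .read() on it. A sketch of the tail of the function under that reading, parsing the buffered string instead:

c.setopt(c.WRITEDATA, buff)
c.perform()                              # returns None; the body is written into buff
print('Status: %d' % c.getinfo(c.RESPONSE_CODE))
c.close()

body = buff.getvalue().decode('utf-8')
buff.close()

resp_data = json.loads(body)             # parse the buffered string, not perform()'s return value
if resp_data['status'] == 'success':
    token = resp_data['token']
else:
    print resp_data['reason']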

python script to download youtube video

Given a YouTube video URL, I first download the video page and extract the JavaScript object between
<script>var ytplayer = ytplayer ..... </script>
I got:
{
"args": {
"is_listed": "1",
"account_playback_token": "QUFFLUhqbWdXR1NfQjRiRmNzWVhRVTM0ajlNcnM1alVUd3xBQ3Jtc0tsVi01WFp5VmV2MTU3RnpkYUVkRzVqR1ZTNUI4T2JaQzk1ckxPejdVNkYzUk5zOTdjZnNmb1BYZHNLQ05nblZZbFk2ZWJXNHRPNVFoNVVNc2RjTE1YekdKSGY4dlVhSnlCU1ctNFZJdXBKbWhIRG1TZw==",
"ptk": "RajshriEntertainment",
"focEnabled": "1",
"tag_for_child_directed": false,
"adaptive_fmts": ......,
"probe_url": .....,
"rmktEnabled": "1",
"allow_ratings": "1",
"dbp": "ChoKFk5RNTV5UGs5bDZmSk5wSjQ4a3RiSHcQARABGAI",
"cc3_module": "1",
"no_get_video_log": "1",
"fmt_list": ......,
"title":..........,
"invideo": true,
"sffb": true,
"iurlmq_webp": ,
"cosver": "10_8_4",
"url_encoded_fmt_stream_map": .................,
"max_dynamic_allocation_ad_tag_length": "2040",
"innertube_api_key": "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8",
"timestamp": "1446586407",
"cc_asr": "1",
"apiary_host_firstparty": "",
"adsense_video_doc_id": "yt_Vd4iNPuRlx4",
"innertube_context_client_version": "1.20151102",
"mpu": true,
"tmi": "1",
"ldpj": "-19",
"fade_out_duration_milliseconds": "1000",
.........
}
}
I found that the keys adaptive_fmts and url_encoded_fmt_stream_map contain multiple URLs in percent-encoded form.
I took one URL from url_encoded_fmt_stream_map; it looks like this:
https://r1---sn-o3o-qxal.googlevideo.com/videoplayback?
ratebypass=yes&
signature=982E413BBE08CA5801420F9696E0F2ED691B99FA.D666D39D1A0AF066F76F12632A10D3B8076076CE&
lmt=1443906393476832&
expire=1446604919&
fexp=9406983%2C9408710%2C9414764%2C9416126%2C9417707%2C9421410%2C9422596%2C9423663&
itag=22&
dur=128.801&
source=youtube&
upn=pk2CEhVBeFM&
sver=3&
key=yt6&
id=o-AK-OlE5NUsbkp51EZY2yKuz5vsSGofgUvrvTtOrhC72e&
sparams=dur%2Cid%2Cinitcwndbps%2Cip%2Cipbits%2Citag%2Clmt%2Cmime%2Cmm%2Cmn%2Cms%2Cmv%2Cpl%2Cratebypass%2Crequiressl%2Csource%2Cupn%2Cexpire&
mime=video%2Fmp4&
ipbits=0&
pl=21&
ip=x.y.z.a&
initcwndbps=5405000&
requiressl=yes&
mn=sn-o3o-qxal&
mm=31&
ms=au&
mv=m&
mt=1446583222&
itag=22&
type=video/mp4
But when I paste this (above) URL into a browser, nothing happens; it does not work. Please help me.
Also, what is the difference between the URLs contained in adaptive_fmts and url_encoded_fmt_stream_map?
In python2.7, this works:
import urlparse, urllib2

vid = "vzS1Vkpsi5k"
save_title = "YouTube SpaceX - Booster Number 4 - Thaicom 8 06-06-2016"
url_init = "https://www.youtube.com/get_video_info?video_id=" + vid
resp = urllib2.urlopen(url_init, timeout=10)
data = resp.read()
info = urlparse.parse_qs(data)
title = info['title']
print "length: ", info['length_seconds'][0] + " seconds"
stream_map = info['adaptive_fmts'][0]
vid_info = stream_map.split(",")
mp4_filename = save_title + ".mp4"
for video in vid_info:
    item = urlparse.parse_qs(video)
    # print 'quality: ', item['quality'][0]
    # print 'type: ', item['type'][0]
    url_download = item['url'][0]
    resp = urllib2.urlopen(url_download)
    print resp.headers
    length = int(resp.headers['Content-Length'])
    my_file = open(mp4_filename, "w+")
    done, i = 0, 0
    buff = resp.read(1024)
    while buff:
        my_file.write(buff)
        done += 1024
        percent = done * 100.0 / length
        buff = resp.read(1024)
        if not i % 1000:
            percent = done * 100.0 / length
            print str(percent) + "%"
        i += 1
    break