I'm working on a simple scraper that crawls a YouTube video's comment page. The crawler uses Ajax to go through every comment on the page and then saves them to a JSON file. Even with a small number of comments (< 10), it still takes 3+ minutes for the comments to be parsed.
I've tried adding requests-cache and using ujson instead of json to see if there are any benefits, but there's no noticeable difference.
Here's the code I'm using currently:
import os
import sys
import time
import ujson
import requests
import requests_cache
import argparse
import lxml.html
from lxml.cssselect import CSSSelector

requests_cache.install_cache('comment_cache')

YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'
YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'
def find_value(html, key, num_chars=2):
    pos_begin = html.find(key) + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    return html[pos_begin: pos_end]

def extract_comments(html):
    tree = lxml.html.fromstring(html)
    item_sel = CSSSelector('.comment-item')
    text_sel = CSSSelector('.comment-text-content')
    photo_sel = CSSSelector('.user-photo')
    for item in item_sel(tree):
        yield {'cid': item.get('data-cid'),
               'name': item.get('data-name'),
               'ytid': item.get('data-aid'),
               'text': text_sel(item)[0].text_content(),
               'photo': photo_sel(item)[0].get('src')}

def extract_reply_cids(html):
    tree = lxml.html.fromstring(html)
    sel = CSSSelector('.comment-replies-header > .load-comments')
    return [i.get('data-cid') for i in sel(tree)]

def ajax_request(session, url, params, data, retries=10, sleep=20):
    for _ in range(retries):
        response = session.post(url, params=params, data=data)
        if response.status_code == 200:
            response_dict = ujson.loads(response.text)
            return response_dict.get('page_token', None), response_dict['html_content']
        else:
            time.sleep(sleep)
def download_comments(youtube_id, sleep=1, order_by_time=True):
    session = requests.Session()

    # Get Youtube page with initial comments
    response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
    html = response.text
    reply_cids = extract_reply_cids(html)

    ret_cids = []
    for comment in extract_comments(html):
        ret_cids.append(comment['cid'])
        yield comment

    page_token = find_value(html, 'data-token')
    session_token = find_value(html, 'XSRF_TOKEN', 4)

    first_iteration = True

    # Get remaining comments (the same as pressing the 'Show more' button)
    while page_token:
        data = {'video_id': youtube_id,
                'session_token': session_token}
        params = {'action_load_comments': 1,
                  'order_by_time': order_by_time,
                  'filter': youtube_id}
        if order_by_time and first_iteration:
            params['order_menu'] = True
        else:
            data['page_token'] = page_token

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        page_token, html = response
        reply_cids += extract_reply_cids(html)

        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment

        first_iteration = False
        time.sleep(sleep)

    # Get replies (the same as pressing the 'View all X replies' link)
    for cid in reply_cids:
        data = {'comment_id': cid,
                'video_id': youtube_id,
                'can_reply': 1,
                'session_token': session_token}
        params = {'action_load_replies': 1,
                  'order_by_time': order_by_time,
                  'filter': youtube_id,
                  'tab': 'inbox'}

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        _, html = response
        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment

        time.sleep(sleep)
def main(argv):
    parser = argparse.ArgumentParser(add_help=False, description=('Download Youtube comments without using the Youtube API'))
    parser.add_argument('--help', '-h', action='help', default=argparse.SUPPRESS, help='Show this help message and exit')
    parser.add_argument('--youtubeid', '-y', help='ID of Youtube video for which to download the comments')
    parser.add_argument('--output', '-o', help='Output filename (output format is line delimited JSON)')
    parser.add_argument('--timeorder', '-t', action='store_true', help='Download Youtube comments ordered by time')

    try:
        args = parser.parse_args(argv)

        youtube_id = args.youtubeid
        output = args.output
        start_time = time.time()

        if not youtube_id or not output:
            parser.print_usage()
            raise ValueError('you need to specify a Youtube ID and an output filename')

        print 'Downloading Youtube comments for video:', youtube_id
        count = 0
        with open(output, 'wb') as fp:
            for comment in download_comments(youtube_id, order_by_time=bool(args.timeorder)):
                print >> fp, ujson.dumps(comment, escape_forward_slashes=False)
                count += 1
                sys.stdout.write('Downloaded %d comment(s)\r' % count)
                sys.stdout.flush()
        elapsed_time = time.time() - start_time
        print '\nDone! Elapsed time (seconds):', elapsed_time

    except Exception, e:
        print 'Error:', str(e)
        sys.exit(1)

if __name__ == "__main__":
    main(sys.argv[1:])
I'm new to Python so I'm not sure where the bottlenecks are. The finished script will be used to parse through 100,000+ comments so performance is a large factor.
Would using multithreading solve the issue? And if so, how would I refactor this to benefit from it?
Is this strictly a network issue?
Yes, multithreading will speed up the process. Run the network operations (i.e. downloading) in a separate thread.
Yes, it is a network-related issue.
Your requests are I/O bound. When you make a request to Youtube, it takes some time to get the response back; that delay depends mostly on the network, and you can't make an individual request faster. However, you can use threads to send multiple requests in parallel. That won't make any single request faster, but you will process more of them in the same amount of time.
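For illustration, here is a minimal sketch of that idea using concurrent.futures (the fetch and fetch_all helpers are made-up names for this example, not part of the script above): the blocking requests calls run in worker threads, so several downloads are in flight at once.
from concurrent.futures import ThreadPoolExecutor
import requests

def fetch(url):
    # one blocking network call; runs inside a worker thread
    return requests.get(url, timeout=10).text

def fetch_all(urls, max_workers=8):
    # executor.map keeps the results in the same order as the input urls
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(fetch, urls))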
Threading tutorial:
https://pymotw.com/2/threading/
http://www.tutorialspoint.com/python/python_multithreading.htm
An example somewhat similar to your task: http://www.toptal.com/python/beginners-guide-to-concurrency-and-parallelism-in-python
Also, since you will be doing a lot of scraping and processing, I would recommend looking at something like Scrapy; I personally use it for this kind of task.
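For reference, this is the general shape of a minimal Scrapy spider (illustrative only: CommentsSpider is a made-up name, VIDEO_ID is a placeholder, and the selectors are borrowed from the markup the question's code targets).
import scrapy

class CommentsSpider(scrapy.Spider):
    name = "youtube_comments"
    start_urls = ["https://www.youtube.com/all_comments?v=VIDEO_ID"]  # placeholder video id

    def parse(self, response):
        # Scrapy schedules requests concurrently and calls parse() with each response
        for item in response.css(".comment-item"):
            yield {
                "cid": item.attrib.get("data-cid"),
                "text": item.css(".comment-text-content::text").get(),
            }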
Making multiple requests at once will speed up the process, but if it's taking 3 minutes to parse 10 comments you have some other issue, and parsing 100,000 comments will take days. Unless there's a pressing reason to use lxml, I'd suggest you look at BeautifulSoup and let it provide you with lists of the comment tags and their text content rather than extracting them yourself. I'm guessing most of the slowness is in lxml transforming the content you pass to it and then in your manual character counting to find positions in a string. I'm also suspicious of the calls to sleep: what are those for?
Assuming this
print >> fp, ujson.dumps(comment, escape_forward_slashes=False)
count += 1
sys.stdout.write('Downloaded %d comment(s)\r' % count)
is just for debugging, move it into download_comments and use logging so you can turn it on and off. Dumping each individual comment to JSON as you go is going to be slow; you might want to start dumping these into a database now to avoid that. And re-examine why you're doing things one comment at a time: BeautifulSoup should give you a full list of comments and their text with each page load, so you can handle them in batches, which will be handy once you start parsing larger groups.
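A rough sketch of that batch-per-page approach with BeautifulSoup (assuming the same class names and data attributes the lxml selectors above target; extract_comments_batch is a hypothetical helper, untested against YouTube's actual markup):
from bs4 import BeautifulSoup

def extract_comments_batch(html):
    # parse the page once, then pull every comment out in a single pass
    soup = BeautifulSoup(html, "html.parser")
    comments = []
    for item in soup.select(".comment-item"):
        text_node = item.select_one(".comment-text-content")
        photo_node = item.select_one(".user-photo")
        comments.append({
            "cid": item.get("data-cid"),
            "name": item.get("data-name"),
            "ytid": item.get("data-aid"),
            "text": text_node.get_text() if text_node else None,
            "photo": photo_node.get("src") if photo_node else None,
        })
    return comments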
Related
I am making a Python script that uses the API of a free test automation website called TestProject.
Link to their API: https://api.testproject.io/docs/v2/
Basically, what I want to do is grab the PDF report of every test and save it somewhere.
But to make the GET request for that I first need the projectID and jobID, for which I already wrote functions that fetch them and save them in lists.
The problem now is that my code loops through both lists by index, so it doesn't pair the correct projectID with the correct jobID, and it throws errors because that combination does not exist.
So what I need is a way to check which jobIDs belong to which projectID, so that I can make a GET request to get all the executionIDs and then the PDF of the report.
I am kind of new to programming so I would love any help I can get. If anyone has a better solution, please feel free to let me know.
My script:
import requests
import json
import csv
from datetime import datetime
from jsonpath_ng import jsonpath, parse

API_key = 'api_key'
headers = {'Authorization':'{}'.format(API_key)}

list_projectId = []
list_jobId = []
list_executionId = []

ParseData_projectId = parse('$..id')
ParseData_jobId = parse('$..id')
ParseData_executionId = parse('$..id')

def parsing (response,ParseData,list_data):
    # parses data and appends it to the list
    Data = json.loads(response)
    Parsaj = ParseData
    Podatki = Parsaj.find(Data)
    for i in range(0, len(Podatki)):
        vrednost = Podatki[i].value
        list_data.append(vrednost)

def projectId():
    # gets all projectId's and saves them in list_projectId
    url = 'https://api.testproject.io/v2/projects?_start=0'
    response = requests.get(url,headers=headers)
    response_json = response.json()
    converted = json.dumps(response_json)
    parsing(converted,ParseData_projectId,list_projectId)

def jobId():
    # gets all jobId's and saves them in list_jobId
    for i in range(0, len(list_projectId)):
        id = list_projectId[i]
        url = 'https://api.testproject.io/v2/projects/{}'.format(id) + '/jobs?onlyScheduled=false&_start=0'
        response = requests.get(url,headers=headers)
        response_json = response.json()
        converted = json.dumps(response_json)
        parsing(converted,ParseData_jobId,list_jobId)

def executionId():
    # Their API link:
    # https://api.testproject.io/v2/projects/{projectId}/jobs/{jobId}/reports?_start=0
    # the for loop below does not work; this is where I need the help:
    for i in range(0, len(list_projectId)):
        project_id = list_projectId[i]
        job_id = list_jobId[i]
        url = 'https://api.testproject.io/v2/projects/{}'.format(project_id) + '/jobs/{}'.format(job_id) + '/reports?_start=0'
        response = requests.get(url,headers=headers)
        response_json = response.json()
        converted = json.dumps(response_json)
        parsing(converted,ParseData_executionId,list_executionId)

projectId()
print("----------LIST PROJECT ID: ----------")
print(list_projectId)
print("")

jobId()
print("----------LIST JOB ID: ----------")
print(list_jobId)

executionId()
print("----------LIST EXECUTION ID: ----------")
print(list_executionId)
You have to use the 'in' operator to check whether a value exists in a list.
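Going one step beyond that one-liner, here is a rough sketch (untested against the real API, and jobs_by_project is a hypothetical name) of keeping the job IDs grouped per project instead of in one flat list, reusing the URL patterns and helpers from the question's script, so every reports request is built from a jobId that actually belongs to that projectId:
jobs_by_project = {}  # projectId -> list of jobIds for that project

def jobId():
    for project_id in list_projectId:
        url = 'https://api.testproject.io/v2/projects/{}/jobs?onlyScheduled=false&_start=0'.format(project_id)
        response = requests.get(url, headers=headers)
        # store this project's job ids under its own key instead of one flat list
        jobs_by_project[project_id] = [match.value for match in ParseData_jobId.find(response.json())]

def executionId():
    for project_id, job_ids in jobs_by_project.items():
        for job_id in job_ids:  # every job_id here belongs to project_id
            url = 'https://api.testproject.io/v2/projects/{}/jobs/{}/reports?_start=0'.format(project_id, job_id)
            response = requests.get(url, headers=headers)
            parsing(json.dumps(response.json()), ParseData_executionId, list_executionId)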
The situation is that sometimes a request does not load or gets stuck in Python. If that happens, or any other error occurs, I would like to retry it "n" times, waiting up to a maximum of 3 seconds for each attempt, and once the attempts are exhausted print the message f"Could not process {type_1} and {type_2}". Everything runs in parallel with concurrent.futures. Could you help me with that?
import requests
import concurrent.futures
import json

data = [['PEN','USD'],['USD','EUR']]

def currency(element):
    type_1 = element[0]
    type_2 = element[1]
    s = requests.Session()
    url = f'https://usa.visa.com/cmsapi/fx/rates?amount=1&fee=0&utcConvertedDate=07%2F26%2F2022&exchangedate=07%2F26%2F2022&fromCurr={type_1}&toCurr={type_2}'
    a = s.get(url)
    response = json.loads(a.text)
    value = response["convertedAmount"]
    return value

with concurrent.futures.ProcessPoolExecutor() as executor:
    results = executor.map(currency, data)
    for value in results:
        print(value)
Your code is almost there. Here, I modified a few things:
from concurrent.futures import ThreadPoolExecutor
import time

import requests

def convert_currency(tup):
    from_currency, to_currency = tup
    url = (
        "https://usa.visa.com/cmsapi/fx/rates?amount=1&fee=0"
        "&utcConvertedDate=07%2F26%2F2022&exchangedate=07%2F26%2F2022&"
        f"fromCurr={from_currency}&toCurr={to_currency}"
    )
    session = requests.Session()
    for _ in range(3):
        try:
            response = session.get(url, timeout=3)
            if response.ok:
                return response.json()["convertedAmount"]
        except requests.exceptions.ConnectTimeout:
            time.sleep(3)
    return f"Could not process {from_currency} and {to_currency}"

data = [["VND", "XYZ"], ['PEN','USD'], ["ABC", "XYZ"], ['USD','EUR'], ["USD", "XXX"]]

with ThreadPoolExecutor() as executor:
    results = executor.map(convert_currency, data)
    for value in results:
        print(value)
Notes
The code retries 3 times (see the for loop).
Use timeout= to specify the timeout (in seconds).
The .ok attribute tells whether the call was successful.
No need to import json, as the response object can be JSON-decoded with its .json() method.
You might experiment with both ThreadPoolExecutor and ProcessPoolExecutor to see which one performs better.
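One extra detail worth flagging (my addition, not part of the answer above): with timeout=3, requests can raise ReadTimeout as well as ConnectTimeout, so a request that connects but then stalls while reading would currently escape the retry loop. Catching the common base class requests.exceptions.Timeout covers both cases:
# inside convert_currency, as a drop-in for the retry loop above
for _ in range(3):
    try:
        response = session.get(url, timeout=3)
        if response.ok:
            return response.json()["convertedAmount"]
    except requests.exceptions.Timeout:  # parent class of ConnectTimeout and ReadTimeout
        time.sleep(3)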
The code below is a sample from my complete program; I tried to make it understandable.
It sends requests to a REST API. It starts with a URL and the number of pages for this specific search, and tries to fetch the content of each page.
Each page has several results. Each result becomes a FinalObject.
Because there are as many API requests as there are pages, I decided to use multi-threading and the concurrent.futures module.
=> It works but, as I'm new to coding and Python, I still have these 2 questions:
How can I use ThreadPoolExecutor sequentially in this case?
Is there a better way to handle multi-threading in this case?
from concurrent.futures import ThreadPoolExecutor
from requests import get as re_get

def main_function(global_page_number, headers, url_request):
    # create a list of page numbers
    pages_numbers_list = [i for i in range(global_page_number)]

    # for each page, call the page_handler (multi-threading)
    with ThreadPoolExecutor(max_workers=10) as executor:
        for item in pages_numbers_list:
            executor.submit(
                page_handler,
                item,
                url_request,
                headers
            )

def page_handler(page_number, url_request, headers):
    # we change the page number in the url request
    url_request = change_page(url_request, page_number)

    # new request with the new url
    result = re_get(url_request, headers=headers)
    result = result.json()

    # in the result, we find the list of dicts used to create the final objects
    final_object_creation(result['results_list'])
def change_page(url_request, new_page_number):
    "to replace the value of the 'page=' attribute in the url"
    current_nb_page = ''
    start_nb = url_request.find("page=") + len('page=')
    # collect every digit of the current page number
    while start_nb < len(url_request) and url_request[start_nb].isdigit():
        current_nb_page += url_request[start_nb]
        start_nb += 1
    new_url_request = url_request.replace("page=" + current_nb_page,
                                          "page=" + str(new_page_number))
    return new_url_request
def final_object_creation(results_list):
    'thanks to the object from requests.get(), it builds the final objects'
    global current_id_decision, dict_decisions

    # each item in the results list should become an instance of the final object
    for item in results_list:
        # define the identifier of the new Decision object
        current_id_decision += 1
        new_id = current_id_decision

        # create the Decision object and add it to the dict of decisions
        dict_decisions[new_id] = FinalObject(item)

class FinalObject:
    def __init__(self, content):
        self.content = content

current_id_decision = 0
dict_decisions = {}

main_function(1000, "headers", "https://api/v1.0/search?page=0&query=test")
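Regarding the first question above, one common pattern for keeping the results in page order is executor.map, which yields results in submission order even though the requests run concurrently. This is only a sketch: it assumes page_handler is changed to return result['results_list'] instead of calling final_object_creation itself.
from concurrent.futures import ThreadPoolExecutor

def main_function(global_page_number, headers, url_request):
    pages_numbers_list = range(global_page_number)
    with ThreadPoolExecutor(max_workers=10) as executor:
        # the requests run in parallel, but map() hands the results back
        # in the same order the page numbers were submitted
        all_results = executor.map(
            lambda page: page_handler(page, url_request, headers),
            pages_numbers_list,
        )
        for results_list in all_results:  # consumed sequentially, page 0 first
            final_object_creation(results_list)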
I have this snippet:
config = {10: 'https://www.youtube.com/', 5: 'https://www.youtube.com/', 7: 'https://www.youtube.com/',
          3: 'https://sportal.com/', 11: 'https://sportal.com/'}

def test(arg):
    for key in arg.keys():
        requests.get(arg[key], timeout=key)

test(config)
That way, things happen synchronously. I want to do it asynchronously: iterate through the loop without waiting for the response for each address, moving on to the next one, until I've gone through all the addresses in the dictionary. Then I want to wait until I get the responses for all the addresses, and after that get out of the test function. I know that I can do it with threading, but I read that with the asyncio library it can be done better, and I couldn't implement it. If anyone has even better suggestions I am open to them. Here is my try:
async def test(arg):
    loop = asyncio.get_event_loop()
    tasks = [loop.run_in_executor(requests.get(arg[key], timeout=key) for key in arg.keys())]
    await asyncio.gather(*tasks)

asyncio.run(test(config))
Here is the solution:
def addresses(adr, to):
    requests.get(adr, timeout=to)

async def test(arg):
    loop = asyncio.get_event_loop()
    tasks = [loop.run_in_executor(None, addresses, arg[key], key) for key in arg.keys()]
    await asyncio.gather(*tasks)

asyncio.run(test(config))
Now it works asynchronously with the asyncio library rather than with explicit threading (under the hood, run_in_executor still offloads the blocking requests.get calls to an executor's thread pool).
Some good answers here. I had trouble with this myself (I do a lot of web scraping), so I created a package to help me: async-scrape (https://pypi.org/project/async-scrape/).
It supports GET and POST. I tried to make it as easy to use as possible. You just need to specify a handler function for the response when you instantiate it, and then use the scrape_all method to do the work.
It uses the term scrape because I've built in some handlers for common errors that come up when scraping websites.
You can also do things like limit the call rate if you find you're getting blocked.
An example of its use:
# Create an instance
from async_scrape import AsyncScrape

def post_process(html, resp, **kwargs):
    """Function to process the gathered response from the request"""
    if resp.status == 200:
        return "Request worked"
    else:
        return "Request failed"

async_Scrape = AsyncScrape(
    post_process_func=post_process,
    post_process_kwargs={},
    fetch_error_handler=None,
    use_proxy=False,
    proxy=None,
    pac_url=None,
    acceptable_error_limit=100,
    attempt_limit=5,
    rest_between_attempts=True,
    rest_wait=60,
    call_rate_limit=None,
    randomise_headers=True
)

urls = [
    "https://www.google.com",
    "https://www.bing.com",
]

resps = async_Scrape.scrape_all(urls)
To do this inside a loop, I collect the results, add them to a set, and pop off the old ones.
E.g.:
from async_scrape import AsyncScrape
from bs4 import BeautifulSoup as bs

def post_process(html, resp, **kwargs):
    """Function to process the gathered response from the request"""
    # parse the returned html and collect the links flagged as new
    soup = bs(html, "html.parser")
    new_urls = [a.get("href") for a in soup.find_all("a", {"class": "new_link_on_website"})]
    return [new_urls, resp]

async_scrape = AsyncScrape(
    post_process_func=post_process,
    post_process_kwargs={}
)

# Run the loop
urls = set(["https://initial_webpage.com/"])
processed = set()
all_resps = []

while len(urls):
    resps = async_scrape.scrape_all(urls)
    # Split successful and failed requests
    success_reqs = set([
        r["req"] for r in resps
        if not r["error"]
    ])
    errored_reqs = set([
        r["req"] for r in resps
        if r["error"]
    ])
    # Get what you want from the responses
    for r in success_reqs:
        # Add found urls to urls
        urls |= set(r["func_resp"][0])  # "func_resp" is the key to the return from your handler function
        # Collect the response
        all_resps.extend(r["func_resp"][1])
        # Add to processed urls
        processed.add(r["url"])  # "url" is the key to the url from the response
    # Remove processed urls
    urls = urls - processed
Check The Gist
Whenever I change the limit attribute of the object and call the crawl function, the code terminates.
However, the code works if I don't change the limit attribute:
from api import Crawler

if __name__ == "__main__":
    mycrawler = Crawler("http://metasozluk.com/")
    mycrawler.crawl()
Let me explain the code a bit more. This is a so-called simple crawler I tried to write. The limit[0] attribute is the limit on how many pages to crawl. limit[1] is incremented by 1 whenever the crawl function finishes a page, as you can see at line 54 of api.py.
From lines 26 to 31 of api.py, I check if there's a limit and, if there is, whether limit[0] and limit[1] are equal (and if they are equal, the function returns).
However, if I set a limit as in app.py, the code does not run the mycrawler.crawl() function and terminates, while it works if I call it as above. I don't know what the problem is here.
Basic Explanation
What I want to do is:
Check if there is a limit.
If there is, check how many times crawling has run.
If the limit and the run count are equal, return.
If there is not, check the crawling queue.
If there is no URL left to crawl, return.
Edit: Sorry, I (think I) have to do some editing to the code. :/
Does this work for you?
import re, requests, logging, os, time, json
from bs4 import BeautifulSoup as bs

class Crawler(object):
    logger = logging.getLogger("crawler_logger")
    logger.setLevel(logging.DEBUG)
    file_handler = logging.FileHandler(os.getcwd()+"/logs/{}.log".format( str(int(round(time.time()*1000))) ))
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, *args):
        self.logger.info("Initializing Crawler object on '{}' domain with {} urls.".format(args[0], str(args)))
        self.domain = args[0]
        self.urls = list(args)
        self.crawled = []
        self.limit = [0, 0]
        self.dump_file = "urls.json"
    def crawl(self):
        # while there are urls left in self.urls
        while self.urls:
            if self.limit[0] != 0:
                if self.limit[0] == self.limit[1]:
                    self.logger.info("Limit reached, writing to file and returning.")
                    with open(self.dump_file, "w") as dump_file:
                        dump_file.write(json.dumps(self.urls))
                    return
            try:
                self.logger.info("Connecting to {}...".format(self.urls[0]))
                response = requests.get(self.urls[0])
                response.encoding = "utf-8"
                self.logger.info("Analyzing structures...")
                soup = bs(response.text, "html.parser")
                links = soup.find_all("a", {"href": re.compile("^/")})
                hrefs = [x.attrs["href"] for x in links]
                self.logger.info("Checking whether links were already crawled...")
                for href in hrefs:
                    if self.domain[0:-1]+href in self.crawled:
                        self.logger.warn("{} already crawled.".format(str(href)))
                        pass
                    else:
                        self.urls.append(self.domain[0:-1]+href)
                self.crawled.append(self.urls[0])
                # Remove the url that was just crawled from the front of the queue
                self.urls.pop(0)
                self.limit[1] += 1
                self.logger.info("Reached count {}/{}".format(str(self.limit[1]), str(self.limit[0])))
            except Exception as e:
                self.logger.error("Crawling function raised an error, passing: {}".format(str(e)))
            if len(self.urls) == 0:
                self.logger.info("No url left to crawl, returning.")
                with open(self.dump_file, "w+") as dump_file:
                    dump_file.write(json.dumps(self.urls))
                return
if __name__ == "__main__":
    mycrawler = Crawler("http://metasozluk.com/")
    mycrawler.limit[0] = 5
    mycrawler.crawl()