I'm trying to scrape a large number of URLs and I'm using multiprocessing to speed it up, but I can't figure out why it doesn't speed things up at all. Here is part of my code:
def scrape(url,output_path):
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
item_text = soup.select('#scatter6001 script')[0].text
table = soup.find_all('table',{'class':'noborder dark'})
df1 = pd.read_html(str(table),header = 0)
df1 = pd.DataFrame(df1[0])
...
# function for scraping the data from url
rootPath = '...'
urlp1 = "https://www.proteinatlas.org/"
try:
df1 = pd.read_csv(rootPath + "cancer_list1_2(1).csv", header=0);
except Exception as e:
print("File " + f + " doesn't exist")
print(str(e))
sys.exit()
cancer_list = df1.as_matrix().tolist()
URLs = []
for cancer in cancer_list:
urlp2 = "/pathology/tissue/" + cancer[1]
f = cancer[0]
try:
df1 = pd.read_csv(rootPath + f + ".csv", header=0);
except Exception as e:
print("File " + f + " doesn't exist")
print(str(e))
sys.exit()
...
# list of URLs
if __name__ == '__main__':
p = multiprocessing.Pool(processes=6)
records = p.map(scrape(url,output_path))
p.terminate()
p.join()
I'm not sure how to speed up the web scraping using multiprocessing.
You're not actually using multiprocessing: you run the scrape function once yourself and pass its result as an argument to p.map(). Instead, you need to pass map() a callable that takes one argument, together with the iterable of URLs. Note that Pool.map pickles the callable to send it to the worker processes, so a lambda won't work here; functools.partial will:
func = functools.partial(scrape, output_path=output_path)
p.map(func, list_of_urls)
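Putting it together with the snippets above, a minimal sketch (assuming URLs is the list built earlier and output_path is defined at module level) could look like this:

import functools
import multiprocessing

if __name__ == '__main__':
    func = functools.partial(scrape, output_path=output_path)
    with multiprocessing.Pool(processes=6) as p:
        # scrape() runs once per URL, spread across 6 worker processes
        records = p.map(func, URLs)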
I am trying to scrape the following link https://9anime.to/watch/one-piece-dub.34r/r2wjlq using Python/requests_html.
My problem is that the page gets auto-redirected to the default server tab instead of the mp4upload tab; I'm trying to find a fix for this but can't figure it out.
Below is the code:
import re
import requests
import cloudscraper
from urllib import parse
from bs4 import BeautifulSoup
from requests_html import HTMLSession
base_url = 'https://9anime.to'
class nine_scraper:
def get_ep_links(url):
html = nine_scraper.get_html(url, True)
servers = html.find('div', id='servers-container')
if servers:
results = []
mp4upload_results = []
mp4upload = servers.find('div', attrs={'data-id': '35'})
mp4upload_eps = mp4upload.find_all('a', href=True)
for ep in mp4upload_eps:
x = (ep.get('href'), ep.text)
mp4upload_results.append(x)
for result in mp4upload_results:
results.append(base_url + result[0])
return results
else:
print('No servers found!!')
def get_series_info(url):
return
def get_servers(html):
return
def find_download(url):
html = nine_scraper.get_html(url, True)
def search(query):
if '&page=' in query:
query = query.split('&page=')
search_url = base_url + '/search?keyword=' + parse.quote(query[0]) + '&page=' + query[1]
else:
search_url = base_url + '/search?keyword=' + parse.quote(query)
html = nine_scraper.get_html(search_url, False)
film_list = html.find('div', class_='film-list')
if film_list:
results = []
prev_page = html.find('a', class_='pull-left')
next_page = html.find('a', class_='pull-right')
films = film_list.find_all('div', class_='inner')
for film in films:
results.append((film.find('a', class_='name').text.strip(), film.find('a', class_='name').get('href').strip()))
if prev_page.get('href'):
param = parse.urlsplit(base_url + '/' + prev_page.get('href')).query
url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
results.append(('Previous page', url))
if next_page.get('href'):
param = parse.urlsplit(base_url + '/' + next_page.get('href')).query
url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
results.append(('Next page', url))
return results
else:
print('No results found!')
def get_html(url, render_js=False): # Load webpage and return its html
try:
if render_js: # Check if page needs to render javascript, if so use 'requests_html'
session = HTMLSession() # Make a GET request to your webpage, using 'Requests'
resp = session.get(url, timeout=10)
resp.raise_for_status() # Raise an exception if the response status isn't in the 200-399 range
resp.html.render(timeout=10) # Render the javascript
html = BeautifulSoup(resp.html.html, 'html.parser') # Parse the html data we just got with 'BeautifulSoup4'
return html # Return the parsed html
else: # Use 'cloudscraper' since we don't need to load any javascript
c_scraper = cloudscraper.create_scraper() # Make a GET request to your webpage, using 'Requests'
resp = c_scraper.get(url)
resp.raise_for_status() # Raise an exception if the response status isn't in the 200-399 range
html = BeautifulSoup(resp.content, 'html.parser') # Parse the html data we just got with 'BeautifulSoup4'
return html # Return the parsed html
except requests.HTTPError as e:
print(f'HTTP error occurred: {e}')
except requests.ConnectionError as e:
print(f'Connection Error occurred: {e}')
except requests.Timeout as e:
print(f'Timeout Error occurred: {e}')
except requests.RequestException as e:
print(f'General Error occurred: {e}')
except Exception as e:
print(f'Other error occurred: {e}')
except KeyboardInterrupt:
print("Someone closed the program")
import sys
from os import system, name
from scrapers import nine_scraper
def screen_clear():
# for mac and linux(os.name is 'posix')
if name == 'nt':
_ = system('cls')
else:
_ = system('clear')
def main_menu():
while True:
screen_clear()
print('------9anime downloader------\n[1] Search \n[2] Download \n[3] Exit\n-----------------------------\n')
main_choice = input('Enter your choice [1-3] >')
if main_choice == '1':
search_menu()
break
elif main_choice == '2':
continue
elif main_choice == '3':
screen_clear()
sys.exit()
else:
continue
def search_menu(query=False):
screen_clear()
print('--------------9anime downloader/search--------------\n')
if query:
search_results = nine_scraper.search(query)
results_menu(search_results)
else:
query = input('Please enter the name of the anime >')
if query:
search_results = nine_scraper.search(query)
results_menu(search_results)
def results_menu(results):
for num, result in enumerate(results, 1):
title = result[0]
link = result[1]
if 'Previous page' not in title:
if 'Next page' in title:
n = True
print('[N] ' + title)
else:
print(f'[{num}] {title}')
else:
p = True
print('[P] ' + title)
print('[M] Main menu')
titles, links = map(list, zip(*results))
while True:
search_choice = input('Enter choice >')
try:
search_choice = int(search_choice)
if 1 <= search_choice <= len(results) + 1:
print(links[search_choice - 1])
print(titles[search_choice - 1])
ep_links = nine_scraper.get_ep_links(links[search_choice - 1])
for link in ep_links:
print(link)
nine_scraper.find_download(link)
# series_menu(links[search_choice - 1])
break
except ValueError:
if search_choice.lower() == 'm':
main_menu()
break
elif search_choice.lower() == 'p':
if p:
url = links[-2]
search_menu(url)
break
continue
elif search_choice.lower() == 'n':
if n:
url = links.pop()
search_menu(url)
break
continue
def series_menu(url):
info = nine_scraper.get_series_info()
main_menu()
I know it has to be some JavaScript that is redirecting the page, but I can't figure out what I need to do to stop it. Any help would be much appreciated!
Using requests_html you can set allow_redirects=False like this:
r = session.get(url, allow_redirects=False)
Now your request should go only to the requested URL.
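In the get_html() method from the question, that would mean changing the request line to something like this (a sketch; the rest of the method stays as-is):

resp = session.get(url, timeout=10, allow_redirects=False)

Keep in mind that allow_redirects only stops HTTP-level redirects; if the site switches to the default server tab via JavaScript, that happens during resp.html.render(), so you may also need to skip or adjust the render step.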
I am trying to scrape data from WhoScored.com. I am not sure of the best way to do it, or whether anyone here is familiar with this particular website, but I have a Python script that is supposed to scrape the data.
Here is my code:
import time
import bs4
import selenium_func as sel
from helper_functions import read_from_file, append_to_file
TIERS_PATH = 'tiers_urls/tiers_urls.txt'
TEAMS_PATH = 'teams_urls/teams_urls.txt'
TEAMS_LOGS = 'teams_urls/teams_logs.txt'
"""
Functions
"""
def get_teams_urls(start_idx):
"""
Searches each tier and extracts all the teams' urls within that tier.
"""
server, driver = sel.start_server_and_driver()
tiers_urls = read_from_file(TIERS_PATH)
length = len(tiers_urls)
for tier in tiers_urls[start_idx:]:
error = False
teams_urls = []
try:
complete_url = sel.WHOSCORED_URL + tier
try:
driver.get(complete_url)
content = driver.page_source
soup = bs4.BeautifulSoup(''.join(content), 'lxml')
except Exception as e:
print('\n')
print("Problem accessing {}".format(tier))
print(str(e))
print('\n')
append_to_file("\nError accessing: " + tier + "\n", TEAMS_LOGS)
append_to_file("Index: " + str(tiers_urls.index(tier)), TEAMS_LOGS)
continue
stage = None
stages_div = soup.find('div', {'id':'sub-navigation'})
if stages_div != None:
stage_li = stages_div.find_all('li')[0]
if stage_li != None:
stage_href = stage_li.find('a', href=True)['href']
if stage_href != None:
stage = stage_href.split('/')[8]
if stage != None:
standings_table = soup.find('div', {'id':'standings-'+stage})
standings_tbody = standings_table.find(id='standings-'+stage+'-content')
teams_tr = standings_tbody.find_all('tr')
if len(teams_tr) > 0:
for tr in teams_tr:
team_td = tr.find_all('td')[1]
team_href = team_td.find('a', href=True)['href']
teams_urls.append(team_href)
except Exception as e:
print('\n')
print("Problem reading data from: {}".format(tier))
print(str(e))
print('\n')
append_to_file("\nError reading data from: " + tier + "\n", TEAMS_LOGS)
append_to_file("Index: " + str(tiers_urls.index(tier)), TEAMS_LOGS)
error = True
if error == False:
if len(teams_urls) > 0:
to_store = {tier:teams_urls}
append_to_file(str(to_store), TEAMS_PATH)
append_to_file("\nSuccessfully retrieved from: " + str(tiers_urls.index(tier)) + "/" + str(length), TEAMS_LOGS)
time.sleep(1)
sel.stop_server_and_driver(server, driver)
return
if __name__ == '__main__':
get_teams_urls(0)
The script opens up the website, but it returns this error:
'NoneType' object has no attribute 'find'
How do I fix this and successfully scrape the data?
Sounds like you need some null/None-checks:
for tr in teams_tr:
team_td = tr.find_all('td')[1]
if team_td != None:
team_href = team_td.find('a', href=True)['href']
teams_urls.append(team_href)
You didn't check whether team_td was None before calling find().
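The same 'NoneType' error can also come from the earlier lookups, because soup.find() returns None whenever nothing matches (for example standings_table for a tier without a standings table). A guarded version of that part of the loop, sketched from the code in the question, could look like this:

standings_table = soup.find('div', {'id': 'standings-' + stage})
if standings_table is not None:
    standings_tbody = standings_table.find(id='standings-' + stage + '-content')
    if standings_tbody is not None:
        for tr in standings_tbody.find_all('tr'):
            tds = tr.find_all('td')
            if len(tds) > 1:
                team_link = tds[1].find('a', href=True)
                if team_link is not None:
                    teams_urls.append(team_link['href'])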
1) I have a list of product links containing 3385 links.
2) I have a function get_pro_info(link) that takes a product link and appends the item to a JSON file.
3) I want Selenium to open 5 browsers with 5 links in parallel, get the product information and append it to a file or list, or alternatively open 1 browser with 5 tabs (holding 5 links) and append to the file.
Question: how can I apply threading to my code?
My code:
new_url=''
def get_pro_info(pro_url):
driver = webdriver.Chrome(executable_path=r'C:\Users\Beenu\PycharmProjects/chromedriver.exe')
try:
new_url = 'https://pk.studiobytcs.com' + pro_url
print('new product URL: ' + new_url)
driver.execute_script("window.open('');")
sleep(1)
# use to switch control
driver.switch_to.window(driver.window_handles[0])
# sleep(1)
driver.get(new_url)
except(WebDriverException, selenium.common.exceptions.TimeoutException, Exception) as e:
print('There is error in getting Product by URL in get_pro_info()! \n' + str(e.stacktrace))
pass
description_source_code = ''
# description_soup = BeautifulSoup()
description_soup: BeautifulSoup = object
# global description_soup
try:
# description_soup = BeautifulSoup('html.parser')
description: WebElement = driver.find_element_by_xpath(
'//*[#id="shopify-section-product-template"]/div[2]/div[1]/div/div[2]')
description_source_code = description.get_attribute("innerHTML")
description_soup: BeautifulSoup = BeautifulSoup(description_source_code, 'html.parser')
except NoSuchElementException as e:
print('Product description tag not found! \n' + str(e.stacktrace))
pass
# 179 here
# This is for getting heading product name
head = ''
r_j_title = ''
try:
head = description_soup.find_all("h1", class_="product_name")
# print(head)
r_j_title = head[0].string.strip()
print("Title: " + r_j_title)
except (HTMLParser, IndexError):
print('Fail to get heading/title Tag! \n' + str(HTMLParser))
# This is for get brand name from heading/title
r_j_brand_and_designer = ''
try:
brand_and_designer = head[0].string.strip().split("-")[0]
r_j_brand_and_designer = str(brand_and_designer).strip()
print('Brand and designer: ' + r_j_brand_and_designer)
except (IndexError, ValueError) as e:
print('Fail to Split Brand from heading/title ! \n' + str(e.stacktrace))
# This is for getting price in integer
r_j_price_in_int = ''
try:
price = description_soup.find_all("span", class_="money")
# print(price)
price_new = price[0].string.strip()
print("New price: " + price_new)
# this is for getting price from string
r_c_price = price[0].string.strip().split(".")[1]
r_j_price_in_int = str(r_c_price).replace(",", "")
# price could have a ',' in it
print('Price: ' + r_j_price_in_int)
except (HTMLParser, IndexError, ValueError) as e:
print('Fail to get Tag or failed to Split Brand from heading/title ! \n' + str(e.stacktrace))
# this is for getting full description
description_all = ''
r_j_desc = ''
try:
description_all = description_soup.find_all("div", class_="description")
final_des = str(description_all[0].get_text())
ch = final_des.split()
r_j_desc = str(' '.join(ch))
print("with split ch : " + r_j_desc) # addtion of .string.strip()
except (HTMLParser, IndexError, ValueError) as e:
print('Fail to get all description Tag or failed to Split and removing endline chr from description ! \n' + str(
e.stacktrace))
# This is for the case where the fabric tag is not available
try:
get_split_fibric = description_all[0].get_text().split("Fabric", 1)[1]
get_split_des = get_split_fibric.split("Disclaimer")[0]
r_j_fabric = str(get_split_des).strip()
print("getting fibric: " + r_j_fabric)
except IndexError as e:
r_j_fabric = 'N/A'
print('Fabric is not available: ' + r_j_fabric)
item['brand_name'] = str(r_j_brand_and_designer)
item['designer'] = str(r_j_brand_and_designer)
item['title'] = str(r_j_title)
item['description'] = str(r_j_desc)
item['price'] = int(r_j_price_in_int)
item['currency'] = "PKR"
item['product_id'] = str(r_j_title)
item['source'] = str(new_url)
item['fabric'] = str(r_j_fabric)
item['gender'] = "woman"
print(item)
cloth = {
"cloth": item
}
# instruction
print(cloth)
list_before_dump.append(cloth)
driver.close()
driver.quit()
with open('product_link_read.txt', 'r') as file:
data = file.readlines()
# rd_pro_link_list=rd_pro_link_list+data.replace('\n', '')
print(data)
for line in data:
# fap=
rd_pro_link_list.append(str(line).strip())
print(rd_pro_link_list)
print(len(rd_pro_link_list))
for pro_link in rd_pro_link_list:
get_pro_info(pro_link)
print('Pro count = ' + str(pro_count))
pro_count = pro_count + 1
list_before_dump_file.write(json.dumps(list_before_dump))
driver.close()
list_before_dump_file.close()
If you want to iterate over the list and always take 20 links at a time, you can use range(start, stop, step) with step=20:
all_t = []
for i in range(0, len(list_of_product_link), 20):
twenty_links = list_of_product_link[i:i+20]
t = threading.Thread(target=get_product_info, args=(twenty_links,))
t.start()
all_t.append(t)
# --- later ---
for t in all_t:
t.join()
or
for i in range(0, len(list_of_product_link), 20):
twenty_links = list_of_product_link[i:i+20]
all_t = []
for link in twenty_links:
t = threading.Thread(target=get_product_info, args=(link,))
t.start()
all_t.append(t)
# --- inside first `for` loop ---
for t in all_t:
t.join()
Another method works if you don't need the list later (it consumes it):
all_t = []
while list_of_product_link:
twenty_links = list_of_product_link[:20]
list_of_product_link = list_of_product_link[20:]
t = threading.Thread(target=get_product_info, args=(twenty_links,))
t.start()
all_t.append(t)
# --- later ---
for t in all_t:
t.join()
or
while list_of_product_link:
twenty_links = list_of_product_link[:20]
list_of_product_link = list_of_product_link[20:]
all_t = []
for link in twenty_links:
t = threading.Thread(target=get_product_info, args=(link,))
t.start()
all_t.append(t)
# --- inside first `for` loop ---
for t in all_t:
t.join()
BTW: args= needs a tuple, even when you have only one argument, so you need a , inside ( ) to create a one-element tuple.
BTW: If you want at most 20 workers running at any moment, it is better to look at multiprocessing and Pool(20):
from multiprocessing import Pool
def get_product_info(link):
result = ....
return result
if __name__ == '__main__':
with Pool(20) as p:
all_results = p.map(get_product_info, list_of_product_link)
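Because get_pro_info() creates and quits its own Chrome driver, a thread pool is another option and avoids pickling arguments between processes. A minimal sketch with 5 concurrent browsers (matching the 5 browsers asked for, and assuming rd_pro_link_list is the list read from product_link_read.txt) might be:

from multiprocessing.pool import ThreadPool

if __name__ == '__main__':
    with ThreadPool(5) as pool:  # at most 5 browsers open at the same time
        pool.map(get_pro_info, rd_pro_link_list)

Note that get_pro_info() currently writes to shared globals like item and list_before_dump, so with any kind of parallelism each call should build and return its own dict instead.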
I've been trying to scrape Instagram posts for a certain hashtag, pulling the keys display_url, taken_at_timestamp, text and edge_liked_by. This works perfectly for some hundreds of posts at the start, but then it stops fetching only the 'text' key; the other three fields are still fetched successfully. I am not sure why this happens.
I am parsing the JSON from https://www.instagram.com/explore/tags/something/?__a=1.
base_url = "https://www.instagram.com/explore/tags/salonedelmobile/?__a=1"
url = "https://www.instagram.com/explore/tags/salonedelmobile/?__a=1"
while True:
response = url_req.urlopen(url)
json_file = json.load(response)
for i in json_file['graphql']['hashtag']['edge_hashtag_to_media']['edges']:
try:
post_text = i['node']['edge_media_to_caption']['edges'][0]['node']['text']
except IndexError as e:
post_text = e
try:
display_url = i['node']['display_url']
except:
display_url = e
try:
like_count = i['node']['edge_liked_by']['count']
except:
like_count = e
try:
time_stamp = i['node']['taken_at_timestamp']
except:
time_stamp = e
output.append([display_url, like_count, time_stamp, post_text])
df = pd.DataFrame(output,columns=['URL', 'Like Count', 'Time', 'Text'])
try:
df.to_excel('instagram.xlsx')
except:
pass
if json_file['graphql']['hashtag']['edge_hashtag_to_media']['page_info']['has_next_page'] == True:
end_cursor = json_file['graphql']['hashtag']['edge_hashtag_to_media']['page_info']['end_cursor']
url = base_url + '&max_id=' + end_cursor
else:
break
I'm trying to run a function that has an infinite loop (to check data after a delay of a few seconds) using multithreading. Since I read the data from a CSV file, I'm also using Queues.
My current function works fine when I do not use multithreading/queues, but when I use them the function only loops once and then stops.
Here's the function with the infinite loop. Please note that the first while True loop is there for the threads (in case I use fewer threads than there are rows in the CSV); the function itself only requires the second while True loop.
def doWork(q):
while True:
#logging.info('Thread Started')
row=q.get()
url = row[0]
target_price = row[1]
#logging.info('line 79')
while True:
delay=randint(5,10)
headers = {'User-Agent': generate_user_agent()}
print datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')+': '+'Sleeping for ' + str(delay) + ' seconds'
#logging.info('line 81')
eventlet.sleep(delay)
try:
#logging.info('line 85')
with requests.Session() as s:
#logging.info('line 87')
with eventlet.Timeout(10, False):
page = s.get(url,headers=headers,proxies=proxyDict,verify=False)
#logging.info('line 89')
tree = html.fromstring(page.content)
#logging.info('line 91')
price = tree.xpath('//div[@class="a-row a-spacing-mini olpOffer"]/div[@class="a-column a-span2 olpPriceColumn"]/span[@class="a-size-large a-color-price olpOfferPrice a-text-bold"]/text()')[0]
title = tree.xpath('//h1/text()')[0]
#logging.info('line 93')
new_price = re.findall("[-+]?\d+[\.]?\d+[eE]?[-+]?\d*", price)[0]
#logging.info('line 95')
old_price = new_price
#logging.info('line 97')
#print price
print new_price
print title + 'Current price:' + new_price
if float(new_price)<float(target_price):
print 'Lower price found!'
mydriver = webdriver.Chrome()
send_simple_message()
login(mydriver)
print 'Old Price: ' + old_price
print 'New Price: ' + new_price
else:
print 'Trying again'
q.task_done()
except Exception as e:
print e
print 'Error!'
q.task_done()
And here is my thread driver code:
q = Queue(concurrent * 2)
if __name__ == "__main__":
for i in range(concurrent):
t = Thread(target=doWork,args=(q,))
t.daemon = True
t.start()
try:
with open('products.csv','r') as f:
reader = csv.reader(f.read().splitlines())
for row in reader:
q.put((row[0],row[1]))
q.join()
except KeyboardInterrupt:
sys.exit(1)
For anyone facing the same issue, here's how I solved it.
I removed q.task_done() from inside the while loop and put it outside the loop. This is working as intended, but I'm not sure if it is the right approach.
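The extra q.task_done() calls are the most likely culprit: every call tells q.join() in the main thread that one more row is finished, so calling it on each pass of the inner loop either makes q.join() return early (the main program then exits and the daemon threads are killed with it) or raises ValueError('task_done() called too many times'). Calling it at most once per q.get() avoids both. A sketch of that structure, where check_prices() is a hypothetical helper standing in for the request/parse/compare body above:

def doWork(q):
    while True:
        row = q.get()
        url, target_price = row[0], row[1]
        q.task_done()  # acknowledge this row exactly once; the loop below never returns
        while True:
            # hypothetical helper wrapping the requests/lxml/price-check code shown above
            check_prices(url, target_price)
            eventlet.sleep(randint(5, 10))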