I'm trying to scrape a large number of URLs and I'm using multiprocessing to speed it up, but I can't figure out why it doesn't speed things up at all. Here is part of my code:
def scrape(url,output_path):
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
item_text = soup.select('#scatter6001 script')[0].text
table = soup.find_all('table',{'class':'noborder dark'})
df1 = pd.read_html(str(table),header = 0)
df1 = pd.DataFrame(df1[0])
...
# function for scraping the data from url
rootPath = '...'
urlp1 = "https://www.proteinatlas.org/"
try:
df1 = pd.read_csv(rootPath + "cancer_list1_2(1).csv", header=0);
except Exception as e:
print("File " + f + " doesn't exist")
print(str(e))
sys.exit()
cancer_list = df1.as_matrix().tolist()
URLs = []
for cancer in cancer_list:
urlp2 = "/pathology/tissue/" + cancer[1]
f = cancer[0]
try:
df1 = pd.read_csv(rootPath + f + ".csv", header=0);
except Exception as e:
print("File " + f + " doesn't exist")
print(str(e))
sys.exit()
...
# list of URLs
if __name__ == '__main__':
p = multiprocessing.Pool(processes=6)
records = p.map(scrape(url,output_path))
p.terminate()
p.join()
I'm not sure how to speed up the web scraping using multiprocessing.
You're not actually using multiprocessing: you run the scrape function once yourself and pass its result as an argument to p.map(). Instead, you need to pass map() a callable that takes one argument, together with the iterable of URLs. Note that Pool.map pickles the callable to send it to the worker processes, so a lambda won't work here; functools.partial will:
func = functools.partial(scrape, output_path=output_path)
p.map(func, list_of_urls)
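Putting it together with the snippets above, a minimal sketch (assuming URLs is the list built earlier and output_path is defined at module level) could look like this:

import functools
import multiprocessing

if __name__ == '__main__':
    func = functools.partial(scrape, output_path=output_path)
    with multiprocessing.Pool(processes=6) as p:
        # scrape() runs once per URL, spread across 6 worker processes
        records = p.map(func, URLs)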
I am trying to scrape the following link https://9anime.to/watch/one-piece-dub.34r/r2wjlq using Python/requests_html.
My problem is that the page gets auto-redirected to the default server tab instead of the mp4upload tab; I'm trying to find a fix for this but can't figure it out.
Below is the code:
import re
import requests
import cloudscraper
from urllib import parse
from bs4 import BeautifulSoup
from requests_html import HTMLSession
base_url = 'https://9anime.to'
class nine_scraper:
def get_ep_links(url):
html = nine_scraper.get_html(url, True)
servers = html.find('div', id='servers-container')
if servers:
results = []
mp4upload_results = []
mp4upload = servers.find('div', attrs={'data-id': '35'})
mp4upload_eps = mp4upload.find_all('a', href=True)
for ep in mp4upload_eps:
x = (ep.get('href'), ep.text)
mp4upload_results.append(x)
for result in mp4upload_results:
results.append(base_url + result[0])
return results
else:
print('No servers found!!')
def get_series_info(url):
return
def get_servers(html):
return
def find_download(url):
html = nine_scraper.get_html(url, True)
def search(query):
if '&page=' in query:
query = query.split('&page=')
search_url = base_url + '/search?keyword=' + parse.quote(query[0]) + '&page=' + query[1]
else:
search_url = base_url + '/search?keyword=' + parse.quote(query)
html = nine_scraper.get_html(search_url, False)
film_list = html.find('div', class_='film-list')
if film_list:
results = []
prev_page = html.find('a', class_='pull-left')
next_page = html.find('a', class_='pull-right')
films = film_list.find_all('div', class_='inner')
for film in films:
results.append((film.find('a', class_='name').text.strip(), film.find('a', class_='name').get('href').strip()))
if prev_page.get('href'):
param = parse.urlsplit(base_url + '/' + prev_page.get('href')).query
url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
results.append(('Previous page', url))
if next_page.get('href'):
param = parse.urlsplit(base_url + '/' + next_page.get('href')).query
url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
results.append(('Next page', url))
return results
else:
print('No results found!')
def get_html(url, render_js=False): # Load webpage and return its html
try:
if render_js: # Check if page needs to render javascript, if so use 'requests_html'
session = HTMLSession() # Make a GET request to your webpage, using 'Requests'
resp = session.get(url, timeout=10)
resp.raise_for_status() # Raise an exception if the response status isn't in the 200-399 range
resp.html.render(timeout=10) # Render the javascript
html = BeautifulSoup(resp.html.html, 'html.parser') # Parse the html data we just got with 'BeautifulSoup4'
return html # Return the parsed html
else: # Use 'cloudscraper' since we don't need to load any javascript
c_scraper = cloudscraper.create_scraper() # Make a GET request to your webpage, using 'Requests'
resp = c_scraper.get(url)
resp.raise_for_status() # Raise an exception if the response status isn't in the 200-399 range
html = BeautifulSoup(resp.content, 'html.parser') # Parse the html data we just got with 'BeautifulSoup4'
return html # Return the parsed html
except requests.HTTPError as e:
print(f'HTTP error occurred: {e}')
except requests.ConnectionError as e:
print(f'Connection Error occurred: {e}')
except requests.Timeout as e:
print(f'Timeout Error occurred: {e}')
except requests.RequestException as e:
print(f'General Error occurred: {e}')
except Exception as e:
print(f'Other error occurred: {e}')
except KeyboardInterrupt:
print("Someone closed the program")
import sys
from os import system, name
from scrapers import nine_scraper
def screen_clear():
# for mac and linux(os.name is 'posix')
if name == 'nt':
_ = system('cls')
else:
_ = system('clear')
def main_menu():
while True:
screen_clear()
print('------9anime downloader------\n[1] Search \n[2] Download \n[3] Exit\n-----------------------------\n')
main_choice = input('Enter your choice [1-3] >')
if main_choice == '1':
search_menu()
break
elif main_choice == '2':
continue
elif main_choice == '3':
screen_clear()
sys.exit()
else:
continue
def search_menu(query=False):
screen_clear()
print('--------------9anime downloader/search--------------\n')
if query:
search_results = nine_scraper.search(query)
results_menu(search_results)
else:
query = input('Please enter the name of the anime >')
if query:
search_results = nine_scraper.search(query)
results_menu(search_results)
def results_menu(results):
for num, result in enumerate(results, 1):
title = result[0]
link = result[1]
if 'Previous page' not in title:
if 'Next page' in title:
n = True
print('[N] ' + title)
else:
print(f'[{num}] {title}')
else:
p = True
print('[P] ' + title)
print('[M] Main menu')
titles, links = map(list, zip(*results))
while True:
search_choice = input('Enter choice >')
try:
search_choice = int(search_choice)
if 1 <= search_choice <= len(results) + 1:
print(links[search_choice - 1])
print(titles[search_choice - 1])
ep_links = nine_scraper.get_ep_links(links[search_choice - 1])
for link in ep_links:
print(link)
nine_scraper.find_download(link)
# series_menu(links[search_choice - 1])
break
except ValueError:
if search_choice.lower() == 'm':
main_menu()
break
elif search_choice.lower() == 'p':
if p:
url = links[-2]
search_menu(url)
break
continue
elif search_choice.lower() == 'n':
if n:
url = links.pop()
search_menu(url)
break
continue
def series_menu(url):
info = nine_scraper.get_series_info()
main_menu()
I know it has to be some JavaScript that is redirecting the page, but I can't figure out what I need to do to stop it. Any help would be much appreciated!
Using requests_html you can set allow_redirects=False like this:
r = session.get(url, allow_redirects=False)
Now your request should go only to the requested URL.
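In the get_html() method from the question, that would mean changing the request line to something like this (a sketch; the rest of the method stays as-is):

resp = session.get(url, timeout=10, allow_redirects=False)

Keep in mind that allow_redirects only stops HTTP-level redirects; if the site switches to the default server tab via JavaScript, that happens during resp.html.render(), so you may also need to skip or adjust the render step.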
I am trying to scrape data from WhoScored.com. I am not sure of the best way to do it, or whether anyone here is familiar with this particular website, but I have a Python script that is supposed to scrape the data.
Here is my code:
import time
import bs4
import selenium_func as sel
from helper_functions import read_from_file, append_to_file
TIERS_PATH = 'tiers_urls/tiers_urls.txt'
TEAMS_PATH = 'teams_urls/teams_urls.txt'
TEAMS_LOGS = 'teams_urls/teams_logs.txt'
"""
Functions
"""
def get_teams_urls(start_idx):
"""
Searches each tier and extracts all the teams' urls within that tier.
"""
server, driver = sel.start_server_and_driver()
tiers_urls = read_from_file(TIERS_PATH)
length = len(tiers_urls)
for tier in tiers_urls[start_idx:]:
error = False
teams_urls = []
try:
complete_url = sel.WHOSCORED_URL + tier
try:
driver.get(complete_url)
content = driver.page_source
soup = bs4.BeautifulSoup(''.join(content), 'lxml')
except Exception as e:
print('\n')
print("Problem accessing {}".format(tier))
print(str(e))
print('\n')
append_to_file("\nError accessing: " + tier + "\n", TEAMS_LOGS)
append_to_file("Index: " + str(tiers_urls.index(tier)), TEAMS_LOGS)
continue
stage = None
stages_div = soup.find('div', {'id':'sub-navigation'})
if stages_div != None:
stage_li = stages_div.find_all('li')[0]
if stage_li != None:
stage_href = stage_li.find('a', href=True)['href']
if stage_href != None:
stage = stage_href.split('/')[8]
if stage != None:
standings_table = soup.find('div', {'id':'standings-'+stage})
standings_tbody = standings_table.find(id='standings-'+stage+'-content')
teams_tr = standings_tbody.find_all('tr')
if len(teams_tr) > 0:
for tr in teams_tr:
team_td = tr.find_all('td')[1]
team_href = team_td.find('a', href=True)['href']
teams_urls.append(team_href)
except Exception as e:
print('\n')
print("Problem reading data from: {}".format(tier))
print(str(e))
print('\n')
append_to_file("\nError reading data from: " + tier + "\n", TEAMS_LOGS)
append_to_file("Index: " + str(tiers_urls.index(tier)), TEAMS_LOGS)
error = True
if error == False:
if len(teams_urls) > 0:
to_store = {tier:teams_urls}
append_to_file(str(to_store), TEAMS_PATH)
append_to_file("\nSuccessfully retrieved from: " + str(tiers_urls.index(tier)) + "/" + str(length), TEAMS_LOGS)
time.sleep(1)
sel.stop_server_and_driver(server, driver)
return
if __name__ == '__main__':
get_teams_urls(0)
The script opens up the website, but it returns this error:
'NoneType' object has no attribute 'find'
How do I fix this and successfully scrape the data?
Sounds like you need some null/None-checks:
for tr in teams_tr:
team_td = tr.find_all('td')[1]
if team_td != None:
team_href = team_td.find('a', href=True)['href']
teams_urls.append(team_href)
You didn't check whether team_td was None before calling find().
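The same 'NoneType' error can also come from the earlier lookups, because soup.find() returns None whenever nothing matches (for example standings_table for a tier without a standings table). A guarded version of that part of the loop, sketched from the code in the question, could look like this:

standings_table = soup.find('div', {'id': 'standings-' + stage})
if standings_table is not None:
    standings_tbody = standings_table.find(id='standings-' + stage + '-content')
    if standings_tbody is not None:
        for tr in standings_tbody.find_all('tr'):
            tds = tr.find_all('td')
            if len(tds) > 1:
                team_link = tds[1].find('a', href=True)
                if team_link is not None:
                    teams_urls.append(team_link['href'])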
1) I have a list of product links containing 3385 links.
2) I have a function get_pro_info(link) that takes a product link and appends the item to a JSON file.
3) I want Selenium to open 5 browsers with 5 links in parallel, get the product information and append it to a file or list, or alternatively open 1 browser with 5 tabs (holding 5 links) and append to the file.
Question: how can I apply threading to my code?
My code:
new_url=''
def get_pro_info(pro_url):
driver = webdriver.Chrome(executable_path=r'C:\Users\Beenu\PycharmProjects/chromedriver.exe')
try:
new_url = 'https://pk.studiobytcs.com' + pro_url
print('new product URL: ' + new_url)
driver.execute_script("window.open('');")
sleep(1)
# use to switch control
driver.switch_to.window(driver.window_handles[0])
# sleep(1)
driver.get(new_url)
except(WebDriverException, selenium.common.exceptions.TimeoutException, Exception) as e:
print('There is error in getting Product by URL in get_pro_info()! \n' + str(e.stacktrace))
pass
description_source_code = ''
# description_soup = BeautifulSoup()
description_soup: BeautifulSoup = object
# global description_soup
try:
# description_soup = BeautifulSoup('html.parser')
description: WebElement = driver.find_element_by_xpath(
'//*[#id="shopify-section-product-template"]/div[2]/div[1]/div/div[2]')
description_source_code = description.get_attribute("innerHTML")
description_soup: BeautifulSoup = BeautifulSoup(description_source_code, 'html.parser')
except NoSuchElementException as e:
print('Product description tag not found! \n' + str(e.stacktrace))
pass
# 179 here
# This is for getting heading product name
head = ''
r_j_title = ''
try:
head = description_soup.find_all("h1", class_="product_name")
# print(head)
r_j_title = head[0].string.strip()
print("Title: " + r_j_title)
except (HTMLParser, IndexError):
print('Fail to get heading/title Tag! \n' + str(HTMLParser))
# This is for get brand name from heading/title
r_j_brand_and_designer = ''
try:
brand_and_designer = head[0].string.strip().split("-")[0]
r_j_brand_and_designer = str(brand_and_designer).strip()
print('Brand and designer: ' + r_j_brand_and_designer)
except (IndexError, ValueError) as e:
print('Fail to Split Brand from heading/title ! \n' + str(e.stacktrace))
# This is for getting price in integer
r_j_price_in_int = ''
try:
price = description_soup.find_all("span", class_="money")
# print(price)
price_new = price[0].string.strip()
print("New price: " + price_new)
# this is for getting price from string
r_c_price = price[0].string.strip().split(".")[1]
r_j_price_in_int = str(r_c_price).replace(",", "")
# price could have a ',' in it
print('Price: ' + r_j_price_in_int)
except (HTMLParser, IndexError, ValueError) as e:
print('Fail to get Tag or failed to Split Brand from heading/title ! \n' + str(e.stacktrace))
# this is for getting full description
description_all = ''
r_j_desc = ''
try:
description_all = description_soup.find_all("div", class_="description")
final_des = str(description_all[0].get_text())
ch = final_des.split()
r_j_desc = str(' '.join(ch))
print("with split ch : " + r_j_desc) # addtion of .string.strip()
except (HTMLParser, IndexError, ValueError) as e:
print('Fail to get all description Tag or failed to Split and removing endline chr from description ! \n' + str(
e.stacktrace))
# This is for the case where the fabric tag is not available
try:
get_split_fibric = description_all[0].get_text().split("Fabric", 1)[1]
get_split_des = get_split_fibric.split("Disclaimer")[0]
r_j_fabric = str(get_split_des).strip()
print("getting fibric: " + r_j_fabric)
except IndexError as e:
r_j_fabric = 'N/A'
print('Fabric is not available: ' + r_j_fabric)
item['brand_name'] = str(r_j_brand_and_designer)
item['designer'] = str(r_j_brand_and_designer)
item['title'] = str(r_j_title)
item['description'] = str(r_j_desc)
item['price'] = int(r_j_price_in_int)
item['currency'] = "PKR"
item['product_id'] = str(r_j_title)
item['source'] = str(new_url)
item['fabric'] = str(r_j_fabric)
item['gender'] = "woman"
print(item)
cloth = {
"cloth": item
}
# instruction
print(cloth)
list_before_dump.append(cloth)
driver.close()
driver.quit()
with open('product_link_read.txt', 'r') as file:
data = file.readlines()
# rd_pro_link_list=rd_pro_link_list+data.replace('\n', '')
print(data)
for line in data:
# fap=
rd_pro_link_list.append(str(line).strip())
print(rd_pro_link_list)
print(len(rd_pro_link_list))
for pro_link in rd_pro_link_list:
get_pro_info(pro_link)
print('Pro count = ' + str(pro_count))
pro_count = pro_count + 1
list_before_dump_file.write(json.dumps(list_before_dump))
driver.close()
list_before_dump_file.close()
If you want to iterate over the list and always take 20 links at a time, you can use range(start, stop, step) with step=20:
all_t = []
for i in range(0, len(list_of_product_link), 20):
twenty_links = list_of_product_link[i:i+20]
t = threading.Thread(target=get_product_info, args=(twenty_links,))
t.start()
all_t.append(t)
# --- later ---
for t in all_t:
t.join()
or
for i in range(0, len(list_of_product_link), 20):
twenty_links = list_of_product_link[i:i+20]
all_t = []
for link in twenty_links:
t = threading.Thread(target=get_product_info, args=(link,))
t.start()
all_t.append(t)
# --- inside first `for` loop ---
for t in all_t:
t.join()
Another method works if you don't need the list later (it consumes it):
all_t = []
while list_of_product_link:
twenty_links = list_of_product_link[:20]
list_of_product_link = list_of_product_link[20:]
t = threading.Thread(target=get_product_info, args=(twenty_links,))
t.start()
all_t.append(t)
# --- later ---
for t in all_t:
t.join()
or
while list_of_product_link:
twenty_links = list_of_product_link[:20]
list_of_product_link = list_of_product_link[20:]
all_t = []
for link in twenty_links:
t = threading.Thread(target=get_product_info, args=(link,))
t.start()
all_t.append(t)
# --- inside first `for` loop ---
for t in all_t:
t.join()
BTW: args= needs a tuple, even when you have only one argument, so you need a , inside ( ) to create a one-element tuple.
BTW: If you want at most 20 workers running at any moment, it is better to look at multiprocessing and Pool(20):
from multiprocessing import Pool
def get_product_info(link):
result = ....
return result
if __name__ == '__main__':
with Pool(20) as p:
all_results = p.map(get_product_info, list_of_product_link)
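Because get_pro_info() creates and quits its own Chrome driver, a thread pool is another option and avoids pickling arguments between processes. A minimal sketch with 5 concurrent browsers (matching the 5 browsers asked for, and assuming rd_pro_link_list is the list read from product_link_read.txt) might be:

from multiprocessing.pool import ThreadPool

if __name__ == '__main__':
    with ThreadPool(5) as pool:  # at most 5 browsers open at the same time
        pool.map(get_pro_info, rd_pro_link_list)

Note that get_pro_info() currently writes to shared globals like item and list_before_dump, so with any kind of parallelism each call should build and return its own dict instead.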
I've been trying to scrape Instagram posts for a certain hashtag, pulling the keys display_url, taken_at_timestamp, text and edge_liked_by. This works perfectly for some hundreds of posts at the start, but then it stops fetching only the 'text' key; the other three fields are still fetched successfully. I am not sure why this happens.
I am parsing the JSON from https://www.instagram.com/explore/tags/something/?__a=1.
base_url = "https://www.instagram.com/explore/tags/salonedelmobile/?__a=1"
url = "https://www.instagram.com/explore/tags/salonedelmobile/?__a=1"
while True:
response = url_req.urlopen(url)
json_file = json.load(response)
for i in json_file['graphql']['hashtag']['edge_hashtag_to_media']['edges']:
try:
post_text = i['node']['edge_media_to_caption']['edges'][0]['node']['text']
except IndexError as e:
post_text = e
try:
display_url = i['node']['display_url']
except:
display_url = e
try:
like_count = i['node']['edge_liked_by']['count']
except:
like_count = e
try:
time_stamp = i['node']['taken_at_timestamp']
except:
time_stamp = e
output.append([display_url, like_count, time_stamp, post_text])
df = pd.DataFrame(output,columns=['URL', 'Like Count', 'Time', 'Text'])
try:
df.to_excel('instagram.xlsx')
except:
pass
if json_file['graphql']['hashtag']['edge_hashtag_to_media']['page_info']['has_next_page'] == True:
end_cursor = json_file['graphql']['hashtag']['edge_hashtag_to_media']['page_info']['end_cursor']
url = base_url + '&max_id=' + end_cursor
else:
break
I'm trying to run a function that has an infinite loop (to check data after a delay of a few seconds) using multithreading. Since I read the data from a CSV file, I'm also using Queues.
My current function works fine when I do not use multithreading/queues, but when I use them the function only loops once and then stops.
Here's the function with the infinite loop. Please note that the first while True loop is there for the threads (in case I use fewer threads than there are rows in the CSV); the function itself only requires the second while True loop.
def doWork(q):
while True:
#logging.info('Thread Started')
row=q.get()
url = row[0]
target_price = row[1]
#logging.info('line 79')
while True:
delay=randint(5,10)
headers = {'User-Agent': generate_user_agent()}
print datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')+': '+'Sleeping for ' + str(delay) + ' seconds'
#logging.info('line 81')
eventlet.sleep(delay)
try:
#logging.info('line 85')
with requests.Session() as s:
#logging.info('line 87')
with eventlet.Timeout(10, False):
page = s.get(url,headers=headers,proxies=proxyDict,verify=False)
#logging.info('line 89')
tree = html.fromstring(page.content)
#logging.info('line 91')
price = tree.xpath('//div[@class="a-row a-spacing-mini olpOffer"]/div[@class="a-column a-span2 olpPriceColumn"]/span[@class="a-size-large a-color-price olpOfferPrice a-text-bold"]/text()')[0]
title = tree.xpath('//h1/text()')[0]
#logging.info('line 93')
new_price = re.findall("[-+]?\d+[\.]?\d+[eE]?[-+]?\d*", price)[0]
#logging.info('line 95')
old_price = new_price
#logging.info('line 97')
#print price
print new_price
print title + 'Current price:' + new_price
if float(new_price)<float(target_price):
print 'Lower price found!'
mydriver = webdriver.Chrome()
send_simple_message()
login(mydriver)
print 'Old Price: ' + old_price
print 'New Price: ' + new_price
else:
print 'Trying again'
q.task_done()
except Exception as e:
print e
print 'Error!'
q.task_done()
And here is my thread driver code:
q = Queue(concurrent * 2)
if __name__ == "__main__":
for i in range(concurrent):
t = Thread(target=doWork,args=(q,))
t.daemon = True
t.start()
try:
with open('products.csv','r') as f:
reader = csv.reader(f.read().splitlines())
for row in reader:
q.put((row[0],row[1]))
q.join()
except KeyboardInterrupt:
sys.exit(1)
For anyone facing the same issue, here's how I solved it.
I removed q.task_done() from inside the while loop and put it outside the loop. This is working as intended, but I'm not sure if it is the right approach.
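The extra q.task_done() calls are the most likely culprit: every call tells q.join() in the main thread that one more row is finished, so calling it on each pass of the inner loop either makes q.join() return early (the main program then exits and the daemon threads are killed with it) or raises ValueError('task_done() called too many times'). Calling it at most once per q.get() avoids both. A sketch of that structure, where check_prices() is a hypothetical helper standing in for the request/parse/compare body above:

def doWork(q):
    while True:
        row = q.get()
        url, target_price = row[0], row[1]
        q.task_done()  # acknowledge this row exactly once; the loop below never returns
        while True:
            # hypothetical helper wrapping the requests/lxml/price-check code shown above
            check_prices(url, target_price)
            eventlet.sleep(randint(5, 10))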