Optimize scraping through list of urls and write to csv - python

With a CSV of 20k+ URLs, I want to scrape each one and look for the HTML element "super-attribute-select". If it is found, write the URL to column A and the product number (SKU) to column B. If not found, write the URL to column C and the SKU to column D. Finally, save the dataframe to a CSV file.
If I run the following code it works, but my program runs out of memory. I'd like to find a way to optimize this: right now ~1,500 URLs take 5 hours to process, while the entire CSV is 20k.
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas import Series

urlList = pd.read_csv(r"url.csv")
urlList = urlList.url.tolist()

notfound = []
found = []
skulist = []
skumissinglist = []

# Function scrape, pass url, open with soup, and find class
def scrape(url):
    tag = 'select'
    classused = "super-attribute-select"
    d = dict(A=np.array(found), B=np.array(skulist), C=np.array(notfound), D=np.array(skumissinglist))
    try:
        content = urllib.request.urlopen(url)
        soup = BeautifulSoup(content, features="html.parser")
        sku = soup.find("div", {"itemprop": "sku"}).string
        result = soup.find(tag, class_=classused)
        # soup returns None if it can't find anything
        if result == None:
            notfound.append(url)
            skumissinglist.append(sku)
        else:
            found.append(url)
            skulist.append(sku)
    except:
        result = print("Some extraction went wrong")
    df = pd.DataFrame(dict([(k, Series(v)) for k, v in d.items()]))
    df = df.to_csv('Test.csv')

for i in urlList:
    scrape(i)

If I were doing this, I would try a few things:
(1) Update a dictionary instead of appending to a list. I think dictionaries are faster and more memory-efficient than lists.
(2) Rather than exporting each URL result as a CSV with the same name, either (a) preferred: wait until you are done and export all results as a single CSV (sketched below), or (b) worse: export them to different filenames by using f-strings instead of overwriting 'Test.csv' every time.
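One way to apply (2a), as a minimal sketch reusing the question's own found/skulist/notfound/skumissinglist lists: only append inside scrape(), and build and write the DataFrame a single time once the loop over urlList has finished.

# after the loop `for i in urlList: scrape(i)` has finished:
d = {"A": found, "B": skulist, "C": notfound, "D": skumissinglist}
df = pd.DataFrame({k: Series(v) for k, v in d.items()})  # Series pads unequal lengths with NaN
df.to_csv('Test.csv', index=False)  # one write instead of one per URL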

You could use a pool, either with gevent or the built-in ones from urllib3 (or requests). Then you could do 10 or 100 at a time depending on the pool size, and use an async queue to feed in the remaining URLs as the pools get exhausted.
from gevent import monkey, spawn, joinall
monkey.patch_all()
from gevent.pool import Pool as GeventPool
import pandas as pd
from pandas import Series
import numpy as np
import requests
from bs4 import BeautifulSoup

urlList = pd.read_csv(r"url.csv")
urlList = urlList.url.tolist()
pool = GeventPool(10)

notfound = []
found = []
skulist = []
skumissinglist = []
count = len(urlList)

# Function scrape, pass url, open with soup, and find class
def scrape(url):
    tag = 'select'
    classused = "super-attribute-select"
    d = dict(A=np.array(found), B=np.array(skulist), C=np.array(notfound), D=np.array(skumissinglist))
    try:
        content = requests.get(url).text
        soup = BeautifulSoup(content, features="html.parser")
        sku = soup.find("div", {"itemprop": "sku"}).string
        result = soup.find(tag, class_=classused)
        # soup returns None if it can't find anything
        if result is None:
            notfound.append(url)
            skumissinglist.append(sku)
        else:
            found.append(url)
            skulist.append(sku)
    except:
        print("Some extraction went wrong")
    df = pd.DataFrame(dict([(k, Series(v)) for k, v in d.items()]))
    return df.to_csv('Test.csv')

pool.map(scrape, urlList)
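If you would rather stay with the standard library instead of gevent, the same pooling idea can be sketched with concurrent.futures.ThreadPoolExecutor; this version also collects results and writes the CSV once at the end (a sketch under the question's assumptions about the page structure, not a drop-in replacement):

from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import requests
from bs4 import BeautifulSoup

def check_url(url):
    # Returns (url, sku, found_flag); sku is None if the page can't be parsed.
    try:
        soup = BeautifulSoup(requests.get(url, timeout=10).text, "html.parser")
        sku_tag = soup.find("div", {"itemprop": "sku"})
        sku = sku_tag.string if sku_tag else None
        found = soup.find("select", class_="super-attribute-select") is not None
        return url, sku, found
    except requests.RequestException:
        return url, None, False

urls = pd.read_csv(r"url.csv").url.tolist()
with ThreadPoolExecutor(max_workers=10) as executor:
    results = list(executor.map(check_url, urls))

found_rows = [(u, s) for u, s, ok in results if ok]
missing_rows = [(u, s) for u, s, ok in results if not ok]
df = pd.DataFrame({
    "A": pd.Series([u for u, _ in found_rows]),
    "B": pd.Series([s for _, s in found_rows]),
    "C": pd.Series([u for u, _ in missing_rows]),
    "D": pd.Series([s for _, s in missing_rows]),
})
df.to_csv("Test.csv", index=False)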

Related

Trying to get data from a table using beautifulsoup in python

Trying to get the "all splits" line of numbers from https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame (the HTML is in the picture). My code returns the 'All Splits' text instead of the numbers I'm looking for. How do I change the lookups in the GetStats function to get the numbers instead of the first-column descriptors?
import requests
from bs4 import BeautifulSoup
import re
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import csv

urls = []
data = []
for year in range(2003, 2005):
    for page in range(1, 9):
        url = f'http://www.espn.com/nba/hollinger/statistics/_/page/{page}/year/{year}/qualified/false'
        if url is not None:
            urls.append(url)

def GetData(url):
    names_list = []  # names of players
    pers = []  # player efficiency ratings
    playeridlist = []  # list of player ids to be used in making the new searchable stats url
    statsurls = []  # list of urls generated to get player stats
    # makes a pattern for the function to look for
    pattern = re.compile(r'playerId=(\d+)')
    # sets up soup
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'lxml')
    # finds player names and adds them to the list
    names = soup.find(lambda tag: tag.name == 'a' and 'playerId' in tag['href'])
    bodytext = names.text
    names_list.append(bodytext)
    # finds the player efficiency rating and adds it to the list
    pertag = soup.find('td', class_='sortcell')
    per = pertag.text
    pers.append(per)
    # finds player id
    names = soup.find('a', href=pattern)
    player_id = names['href'].split('playerId=')[1]
    playeridlist.append(player_id)
    # uses the player ids to make a list of new urls for that player's stats
    for player_id in playeridlist:
        statsurl = f"https://insider.espn.com/nba/player/splits/_/id/{player_id}/type/nba/year/{year}/category/perGame"
        if statsurl is not None:
            statsurls.append(statsurl)

    # parses stats to get stats
    def GetStats(statsurl):  # GO BACK AND MAKE A THREAD EXECUTER STATEMENT WITHIN GETDATA FUNCTION BELOW THIS!!!
        statsreq = requests.get(statsurl)
        statssoup = BeautifulSoup(statsreq.text, 'lxml')
        focusing_search = statssoup.find('tr', class_='Table__TR Table__TR--sm Table__even', attrs={'data-idx': '1'})
        playerstathtml = focusing_search.find('td', class_='Table__TD')
        stat_values = [playerstats.text for playerstats in playerstathtml]
        print(stat_values)

    GetStats("https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame")
    # name_and_stats_list = dict(map(lambda i, j: (i, j), names_list, pers))
    print(f"{bodytext}: {per}")
    print(player_id)

GetData('http://www.espn.com/nba/hollinger/statistics/_/page/1/year/2003/qualified/false')
To get the all_splits stats from:
https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame
This is what I did:
I grabbed the table body using soup.select
Then I grabbed the headings and relevant stats by iterating through the columns/rows.
The list comprehension provides the text in list format, which is easy to convert to a dataframe.
Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame'
soup = BeautifulSoup(requests.get(url).content, "html.parser")
t = soup.select('main#fittPageContainer div.Table__Scroller > table > tbody')
headings = [h.text for h in t[0].find_next('tr').find_all('td')]
all_splits = [h.text for h in t[0].find_all('tr')[1].find_all('td')]
df = pd.DataFrame([all_splits], columns=headings)
print(df)
Output: the one-row DataFrame printed by the snippet above (not reproduced here).
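If you also want the other split rows (Home/Away, monthly splits, and so on) rather than just the All Splits line, the same tbody can be iterated row by row. A sketch reusing the t and headings variables above, untested against the live page and assuming every data row has the same number of cells as the heading row:

rows = [[td.text for td in tr.find_all('td')] for tr in t[0].find_all('tr')[1:]]
df_all = pd.DataFrame(rows, columns=headings)
print(df_all)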

How to loop through a list of URLs and download specific data from them in Python BeautifulSoup

Here I have a list of URLs and I am trying to get the "td" elements from all of them, but I am only able to get the last URL's HTML.
import numpy as np
import pandas as pd
from datetime import datetime
import pytz
import requests
import json
from bs4 import BeautifulSoup

url_list = ['https://www.coingecko.com/en/coins/ethereum/historical_data/usd?start_date=2021-08-06&end_date=2021-09-05#panel',
            'https://www.coingecko.com/en/coins/cardano/historical_data/usd?start_date=2021-08-06&end_date=2021-09-05#panel',
            'https://www.coingecko.com/en/coins/chainlink/historical_data/usd?start_date=2021-08-06&end_date=2021-09-05#panel']

for link in range(len(url_list)):
    response = requests.get(url_list[link])
    src = response.content
    soup = BeautifulSoup(response.text, 'html.parser')
    res1 = soup.find_all("td", class_="text-center")

res1
Could anyone please help me get the data from all of the URLs?
You are overwriting your soup variable on each iteration of the loop, so instead of saving the results from every URL and then looping over those, you only end up with the final result.
Create a variable before the loop to store the results of each iteration.
Append the soup to that new variable on each iteration.
Create a new loop to interact with your stored data.
You can also iterate over the list elements directly, which is easier to read:
for url in url_list:
    response = requests.get(url)
    # rest of code
So:
# empty list to store all results
results = []

# your loop here
for u in url_list:
    response = requests.get(u)
    src = response.content
    soup = BeautifulSoup(response.text, 'html.parser')
    results.append(soup.find_all("td", class_="text-center"))

# Accessing the data from the results
for result in results:
    print(result)
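From there, each entry in results is still a list of bs4 td tags, so a small hypothetical follow-up to pull the plain text out of them per page might look like:

for page_tds in results:
    values = [td.get_text(strip=True) for td in page_tds]
    print(values)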

How do I write my data along the columns in a CSV file?

When I write to the CSV file, all of my data ends up in the first column only. Using my loop, how do I iterate along the columns to write the data?
import csv
import bs4
import urllib
from urllib.request import urlopen as uReq
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup

# For sites that can't be opened due to the urllib blocker, use a Mozilla User agent to get access
pageRequest = Request('https://coronavirusbellcurve.com/', headers={'User-Agent': 'Mozilla/5.0'})
htmlPage = urlopen(pageRequest).read()
page_soup = soup(htmlPage, 'html.parser')
specificDiv = page_soup.find("div", {"class": "table-responsive-xl"})
TbodyStats = specificDiv.table.tbody.tr.contents
TbodyDates = specificDiv.table.thead.tr.contents

with open('CovidHTML.csv', 'w', newline='') as file:
    theWriter = csv.writer(file)
    theWriter.writerow(['5/4', ' 5/5', ' 5/6', ' 5/7', ' 5/8', ' 5/9'])
    for i in range(3, len(TbodyStats)):
        if i % 2 != 0:
            theWriter.writerow([TbodyStats[i].text])
Another method, for reference only.
from simplified_scrapy import SimplifiedDoc, utils, req

html = req.get('https://coronavirusbellcurve.com/')
doc = SimplifiedDoc(html)
specificDiv = doc.select('div.table-responsive-xl')  # Get first div. If you want to get all divs, use this method: doc.selects('div.table-responsive-xl')
# TbodyStats = specificDiv.tbody.trs.selects('td|th').text  # Get data
# TbodyDates = specificDiv.thead.trs.selects('td|th').text  # Get date
data = specificDiv.table.trs.selects('td|th').text  # Get all

rows = []
for row in data:
    rows.append(row[1:])
utils.save2csv('test.csv', rows)
Result:
5/5,5/6,5/7,5/8,5/9
1213260,1237960,1266822,1294664,1314610
24423,24700,28862,27842,19946
2.05%,2.04%,2.33%,2.20%,1.54%
I think you may be able to do this (I can't test for sure because I don't have your exact data on hand):
row = []
for i in range(3, len(TbodyStats), 2):
    row.append(TbodyStats[i].text)
    if len(row) == 6:
        theWriter.writerow(row)
        row = []
I added the step to your range so you don't have to use % to find the odd-numbered indices. Each row is built up until it hits 6 members, then flushed to the CSV file and emptied so the process can repeat.
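If the data really is just the odd-indexed cells and should land in one row under the date headings, the same idea can be compressed into a single writerow call (a sketch against the same TbodyStats variable, untested against the live page):

theWriter.writerow([TbodyStats[i].text for i in range(3, len(TbodyStats), 2)])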

Beautifulsoup - Problems for webcrawler

How do I correctly output all the links on this news website (in list form)?
And once they are in list form, how can I return results randomly (3~5 links at a time)?
Note: the element I need starts around line 739 of the page source (roughly; it may shift a bit since the page refreshes every day):
div class="abdominis rlby clearmen"
and I need every link inside it, like:
<a href="https://tw.news.appledaily.com/life/realtime/20180308/1310910/">
Thanks!! The code is below:
from bs4 import BeautifulSoup
from flask import Flask, request, abort
import requests
import re
import random
import types
target_url = 'http://www.appledaily.com.tw/realtimenews/section/new/'
print('Start parsing appleNews....')
rs = requests.session()
res = rs.get(target_url, verify=False)
soup = BeautifulSoup(res.text, 'html.parser')
#can output all links but with useless information
contents = soup.select("div[class='abdominis rlby clearmen']")[0].find_all('a')
print(contents)
#can output single link but not in list form
#contents = soup.select("div[class='abdominis rlby clearmen']")[0].find('a').get('href')
#print(contents)
Here is a solution which will append each link to a list if it is contained in the specified div.
from bs4 import BeautifulSoup
from flask import Flask, request, abort
import requests
import re
import random
import types

target_url = 'http://www.appledaily.com.tw/realtimenews/section/new/'
print('Start parsing appleNews....')
rs = requests.session()
res = rs.get(target_url, verify=False)
soup = BeautifulSoup(res.text, 'html.parser')

list_links = []  # Create empty list
for a in soup.select("div[class='abdominis rlby clearmen']")[0].findAll(href=True):  # find links based on div
    list_links.append(a['href'])  # append to the list
    print(a['href'])  # Check links

for l in list_links:  # print list to screen (2nd check)
    print(l)
To create random links to be returned:
import random  # import random module

random_list = []  # create random list if needed..
random.shuffle(list_links)  # randomly shuffle the list
for i in range(5):  # specify range (5 items in this instance)
    try:
        res = list_links.pop(random.randint(0, len(list_links) - 1))  # pop off a random item based on the size of the list
        print(res)  # print to screen..
        random_list.append(res)  # or append to random_list
    except (IndexError, ValueError):  # list ran out of items
        pass
One last edit, as you asked for it to be returned.
Here it is as a function that returns a list of x random links:
def return_random_link(list_, num):
    """ Takes in a list and returns a random amount of items """
    random.shuffle(list_)
    random_list = []
    for i in range(num):
        try:  # try to pop a random item and append it to the list
            r = list_.pop(random.randint(0, len(list_) - 1))
            random_list.append(r)
        except (IndexError, ValueError):  # no items left
            return random_list  # Return the list of items collected so far
    return random_list

random_list = return_random_link(list_links, 5)
for i in random_list:
    print(i)
If you want the link tags without their descendants, you can clear them:
for elm in contents:
    elm.clear()
I imagine I'd be more interested in extracting just the links, though:
contents = [a['href'] for a in contents]
To get results in a random order, try using random.shuffle() and grabbing however many elements from the reshuffled list at a time you need.
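For example, a minimal sketch reusing the contents list of hrefs from the line above:

import random

random.shuffle(contents)  # reshuffle the list of hrefs in place
print(contents[:4])       # take however many you need per call (3-5 here)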

Creating Large DataFrame from smaller DataFrames

I am having an issue with the structure of the data as I get it off the PGA website. I have trouble putting the data into a dataframe and merging the data so that I can use the dataframe for analysis later. The dimensions of the scraped data are never right, and I get a different error each time I run the code that I can't seem to reconcile.
I have tried merging and concatenating dataframes, but nothing seems to work. Any help is appreciated.
I would really like my dataframe to contain the individual statistics from the separate sites, but on the same row as the other data, keyed by year and PLAYER NAME.
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import socket
import urllib.error
import pandas as pd
import urllib
import sqlalchemy
import numpy as np
import functools

base = 'http://www.pgatour.com/'
inn = 'stats/stat'
end = '.html'
years = ['2017', '2016']

alpha = []
# all pages with links to tables
urls = ['http://www.pgatour.com/stats.html',
        'http://www.pgatour.com/stats/categories.ROTT_INQ.html',
        'http://www.pgatour.com/stats/categories.RAPP_INQ.html',
        'http://www.pgatour.com/stats/categories.RARG_INQ.html',
        'http://www.pgatour.com/stats/categories.RPUT_INQ.html',
        'http://www.pgatour.com/stats/categories.RSCR_INQ.html',
        'http://www.pgatour.com/stats/categories.RSTR_INQ.html',
        'http://www.pgatour.com/stats/categories.RMNY_INQ.html',
        'http://www.pgatour.com/stats/categories.RPTS_INQ.html']
for i in urls:
    data = urlopen(i)
    soup = BeautifulSoup(data, "html.parser")
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            alpha.append(base + link['href'][17:])  # may need adjusting

# data links
beta = []
for i in alpha:
    if inn in i:
        beta.append(i)

gamma = []
for i in beta:
    if i not in gamma:
        gamma.append(i)

jan = []
for i in gamma:
    try:
        data = urlopen(i)
        soup = BeautifulSoup(data, "html.parser")
        for table in soup.find_all('section', {'class': 'module-statistics-off-the-tee-details'}):
            for j in table.find_all('h3'):
                y = j.get_text().replace(" ", "").replace("-", "").replace(":", "").replace(">", "").replace("<", "").replace(")", "").replace("(", "").replace("=", "").replace("+", "")
                jan.append([i, str(y + '.csv')])
                print([i, str(y + '.csv')])
    except Exception as e:
        print(e)
        pass

# my problem starts here
# using urls list so that I can find error faster
urls = [['http://www.pgatour.com/stats/stat.02356.html', 'd'],
        ['http://www.pgatour.com/stats/stat.02568.html', 'f'],
        ['http://www.pgatour.com/stats/stat.111.html', 'r']]
list = []
master = pd.DataFrame()
# jan = [['http://www.pgatour.com/stats/stat.02356.html', 'Last15EventsScoring.csv']]
# make a list with url and title name and cleaned csv name
# write to csv
row_sp = []
rows_sp = []
title1 = []
title = []
for i in urls:
    try:
        for y in years:
            data = urlopen(i[0][:-4] + y + end)
            soup = BeautifulSoup(data, "html.parser")
            data1 = urlopen(i[0])
            soup1 = BeautifulSoup(data1, "html.parser")
            for table in soup1.find_all('table', {'id': 'statsTable'}):
                title.append('year')
                for k in table.find_all('tr'):
                    for n in k.find_all('th'):
                        title1.append(n.get_text())
                for l in title1:
                    if l not in title:
                        title.append(l)
                rows_sp.append(title)
            for table in soup.find_all('table', {'id': 'statsTable'}):
                for h in table.find_all('tr'):
                    row_sp = [y]
                    for j in h.find_all('td'):
                        row_sp.append(j.get_text().replace(" ", "").replace("\n", "").replace("\xa0", " "))
                    rows_sp.append(row_sp)
            df = pd.DataFrame(rows_sp)
            df.columns = title
            df.drop(df.index[1], inplace=True)
            print(df)
            list.append(df)
    except Exception as e:
        print(e)
        pass

df_merge = functools.reduce(lambda left, right: pd.merge(left, right, on=['year', 'PLAYER NAME'], how='outer'), list)
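As a sanity check for that final merge: every DataFrame in list needs both a 'year' and a 'PLAYER NAME' column, or pd.merge will raise a KeyError. A minimal, self-contained example of the same functools.reduce/pd.merge pattern with made-up data (not the PGA tables):

import functools
import pandas as pd

df_a = pd.DataFrame({'year': ['2017', '2017'], 'PLAYER NAME': ['A', 'B'], 'stat1': [1, 2]})
df_b = pd.DataFrame({'year': ['2017', '2017'], 'PLAYER NAME': ['A', 'B'], 'stat2': [3, 4]})

frames = [df_a, df_b]
merged = functools.reduce(
    lambda left, right: pd.merge(left, right, on=['year', 'PLAYER NAME'], how='outer'),
    frames,
)
print(merged)  # one row per (year, PLAYER NAME), with stat1 and stat2 side by side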
