How to scrape all values of an AJAX search table with Python?

I am trying to scrape the CPU Specs Database at TechPowerUp.
I found that the table updates via AJAX and wrote the following code:
import requests
from bs4 import BeautifulSoup
import csv
import string

cpus = []
base = 'https://www.techpowerup.com/cpu-specs/?ajaxsrch='
letters = list(string.ascii_lowercase)
letters.extend(range(0, 10))

for i in letters:
    URL = base + str(i)
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')

    with open('cpu2.csv', mode='a') as cpu_csv:
        headers = ['name', 'family', 'socket', 'release']
        writer = csv.DictWriter(cpu_csv, fieldnames=headers, lineterminator='\n')
        for tr in soup.find_all('tr')[1:]:
            tds = tr.find_all('td')
            if tds[0].text.strip() not in cpus:
                writer.writerow({'name': tds[0].text.strip(), 'family': tds[1].text.strip(), 'socket': tds[4].text.strip(), 'release': tds[8].text.strip()})
                cpus.append(tds[0].text.strip())
            else:
                print("duplicate")
This code works in the sense that it loops through a–z and then 0–9 and populates a CSV file while ignoring duplicates; however, I only end up with ~600 results when there are over 2,000 entries.
I believe this may be due to a limit on the number of items returned per AJAX search request, so not all entries are discovered. Is there a different approach that would fetch all results?
Thanks

import pandas as pd
import string

items = string.digits + string.ascii_lowercase

def main(url):
    data = []
    for item in items:
        print(f"{item}")
        df = pd.read_html(url.format(item))[0]
        df = df[["Name", "Codename", "Socket", "Released"]]
        data.append(df)
    data = pd.concat(data)
    data.drop_duplicates(subset='Name', keep="first", inplace=True)
    data.to_csv("data.csv", index=False)

main("https://www.techpowerup.com/cpu-specs/?ajaxsrch={}")
Total output is 596 rows after removing duplicates by the Name column.
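If the endpoint does cap the number of rows returned per request, as the question suspects, one possible extension — untested, and assuming that two-character search strings each return fewer results than that cap — is to iterate over every two-character combination instead of single characters:

from itertools import product
import string
import pandas as pd

items = string.digits + string.ascii_lowercase

def main(url):
    data = []
    # Two-character queries are more specific, so each response should
    # stay under the (assumed) per-request result cap. This issues
    # 36 * 36 = 1296 requests, so it is slow.
    for a, b in product(items, repeat=2):
        try:
            df = pd.read_html(url.format(a + b))[0]
        except ValueError:
            continue  # no table returned for this combination
        data.append(df[["Name", "Codename", "Socket", "Released"]])
    data = pd.concat(data)
    data.drop_duplicates(subset="Name", keep="first", inplace=True)
    data.to_csv("data.csv", index=False)

main("https://www.techpowerup.com/cpu-specs/?ajaxsrch={}")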

The easiest way to get the table data is with pandas. Read the data into a DataFrame and export it to CSV.
Code:
import string
import pandas as pd

base = 'https://www.techpowerup.com/cpu-specs/?ajaxsrch='
letters = list(string.ascii_lowercase)
letters.extend(range(0, 10))

df = pd.DataFrame()
for i in letters:
    URL = base + str(i)
    df1 = pd.read_html(URL)[0]
    df = df.append(df1, ignore_index=True)

print(df[['Name', 'Codename', 'Socket', 'Released']])  # This will give you 1739 records

# If you want to delete duplicates use this
df.drop_duplicates(subset='Name', keep='first', inplace=True)
print(df[['Name', 'Codename', 'Socket', 'Released']])  # This will give you 595 records

# Export to a CSV file
df[['Name', 'Codename', 'Socket', 'Released']].to_csv("cpu_csv.csv", index=False)
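Note that DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so the loop above will fail on a recent install; a sketch of the same idea using pd.concat instead:

import string
import pandas as pd

base = 'https://www.techpowerup.com/cpu-specs/?ajaxsrch='
letters = list(string.ascii_lowercase)
letters.extend(range(0, 10))

# Collect one DataFrame per search term, then concatenate once at the end.
frames = [pd.read_html(base + str(i))[0] for i in letters]
df = pd.concat(frames, ignore_index=True)

df.drop_duplicates(subset='Name', keep='first', inplace=True)
df[['Name', 'Codename', 'Socket', 'Released']].to_csv("cpu_csv.csv", index=False)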

Related

Writing different columns of a Pandas DataFrame in one row?

I have scraped a website to extract shoe and clothing prices, their image IDs, image URLs, and some other features. I succeeded in writing the DataFrame to a CSV file, but I realized that each feature is written to a different row when they should all be gathered in one row; I have shown a sample of the output from my CSV file below.
Any suggestions on how to change the code?
from bs4 import BeautifulSoup
import requests
import re
import csv
import pandas as pd
import os
import urllib.request

df = pd.DataFrame(columns=['PostID', 'Description', 'Kind', 'Price', 'ImageID', 'ImageURL'])

def scraping():
    global h, df
    with open("/home/user/Documents/file.txt") as f:
        urls = f.readlines()
        urls = ([s.strip('\n') for s in urls])
        code_list = []
        for url in urls:
            code = url.split('/')[-1]
            code_list.append(code)
            df = df.append({'PostID': code}, ignore_index=True)
            # (the page fetch that defines `soup` is missing from the posted snippet)
            for br in soup.find_all("br"):
                br.replace_with("\n")
            try:
                description = soup.find('div', attrs={'class': 'ui fluid card post-description'}).find('div', attrs={'class': 'content'})
                print(description.text)
                df = df.append({'Description': description.text}, ignore_index=True)
                item_list = []
                items = soup.find_all('span', attrs={'class': 'item__title'})
                for i in items:
                    item_list.append(i.text)
                item_list.pop(0)
                value_list = []
                values = soup.find_all('div', attrs={'class': 'value'})
                for v in values:
                    value_list.append(v.text)
                my_dictionary = {}
                for i in range(1, 3):
                    my_dictionary[item_list[i]] = value_list[i]
                df = df.append({'Kind': my_dictionary['نوع آگهی']}, ignore_index=True)   # 'نوع آگهی' = ad type
                df = df.append({'Price': my_dictionary['قیمت']}, ignore_index=True)      # 'قیمت' = price
                imageresult = []
                path = '/home/user/images'
                images = soup.find_all('img')
                for img in images:
                    imgID = img.get('src').split('/')[-1]
                    df = df.append({'ImageID': imgID}, ignore_index=True)
                    df = df.append({'ImageURL': img.get('src')}, ignore_index=True)
                    urllib.request.urlretrieve(img.get('src'), os.path.join(my_path, os.path.basename(img.get('src'))))
                    print(imgID + img.get('src'))
                else:
                    break
            except:
                print("your URL is invalid :" + url)

scraping()
df.to_csv('divartest14.csv', index=False, encoding='utf-8')
PostID Description Kind Price ImageID
QXZ5RjZj
adidas shoes
feminine
100$
QXZ5RjZj.jpg
That will continue to happen because each call to append adds a new row, so every value ends up in its own row. I'd suggest passing all of the items you want in one row as a single dictionary, i.e.:
df = df.append({'c1': 1, 'c2': 2, 'c3': 3, ...}, ignore_index=True)
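For illustration, a minimal sketch of that suggestion — build one dictionary per post and append it as a single row (the field values below are hypothetical, based on the sample output above):

import pandas as pd

df = pd.DataFrame(columns=['PostID', 'Description', 'Kind', 'Price', 'ImageID', 'ImageURL'])

# One dictionary holding every field of a post, appended as a single row.
# (On pandas >= 2.0, where DataFrame.append has been removed, collect such
# dictionaries in a list and call pd.DataFrame(rows) once instead.)
row = {
    'PostID': 'QXZ5RjZj',                             # hypothetical values
    'Description': 'adidas shoes',
    'Kind': 'feminine',
    'Price': '100$',
    'ImageID': 'QXZ5RjZj.jpg',
    'ImageURL': 'https://example.com/QXZ5RjZj.jpg',   # illustrative URL
}
df = df.append(row, ignore_index=True)
print(df)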

Saving multiple data frames from loop

I have been searching for a solution to my problem, but all the answers I find use print() at the end rather than saving the data frames as I would like.
Below I have an (almost) functioning piece of code that prints 3 separate tables. How do I save these three tables in 3 separate data frames named matches_october, matches_november and matches_december?
The last line in my code does not work as intended. I hope it is clear what I would like the code to do (save a data frame at the end of each of the 3 iterations of the loop).
import pandas as pd
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october', 'november', 'december']
end = '.html'

for i in valid_pages:
    url = '{}{}{}'.format(base_url, i, end)
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find_all('table')[0]
    df = pd.read_html(str(table))
    print(df)
    matches + valid_pages = df[0]
You can special-case each month, but that's not very robust (and it's rather ugly):
if i == 'october':
    matches_october = pd.read_html(str(table))
if i == 'november':
    # so on and so forth
A more elegant solution is to use a dictionary. Before the loop, declare matches = {}. Then, in each iteration:
matches[i] = pd.read_html(str(table))
Then you can access the October matches DataFrame via matches['october'].
You can't compose variable names using +; try using a dict instead:
import pandas as pd
import requests
from bs4 import BeautifulSoup

matches = {}  # create an empty dict

base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october', 'november', 'december']
end = '.html'

for i in valid_pages:
    url = '{}{}{}'.format(base_url, i, end)
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find_all('table')[0]
    df = pd.read_html(str(table))
    print(df)
    matches[i] = df[0]  # store it in the dict
Thanks guys. That worked! :)
import pandas as pd
import requests
from bs4 import BeautifulSoup

matches = {}  # create an empty dict

base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october', 'november', 'december']
end = '.html'

for i in valid_pages:
    url = '{}{}{}'.format(base_url, i, end)
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find_all('table')[0]
    df = pd.read_html(str(table))
    matches[i] = df[0]  # store it in the dict

matches_october = matches['october']

Extracting Several Tables to Excel From BeautifulSoup

I have extracted several tables with BeautifulSoup from different URLs, and they print fine within Python. However, when I try to export them to Excel, it doesn't work. The tables are stored in a list, but I can't find a way to store them in Excel.
I want every table in a different sheet, but I can't write a list straight into Excel.
I have a list of many URLs, but here are some of them to show how they look:
https://www.sec.gov/Archives/edgar/data/3197/000119312510083400/ddef14a.htm
https://www.sec.gov/Archives/edgar/data/3197/000119312511098071/ddef14a.htm
https://www.sec.gov/Archives/edgar/data/3197/000119312512157233/d293744ddef14a.htm
https://www.sec.gov/Archives/edgar/data/3197/000119312513152959/d469796ddef14a.htm
from bs4 import BeautifulSoup
import requests
import pandas as pd
import xlwt

xl = pd.ExcelFile(r'/path/to/file/with/links.xlsx')
link = xl.parse('Sheet1')
book = xlwt.Workbook()
list1 = []

for i in range(10, 16):
    try:
        url = link['Link'][i]
        html = requests.get(url).content
        df_list = pd.read_html(html)
        # I have matched up two keywords
        soup = BeautifulSoup(html, 'lxml')
        table1 = soup.select_one('table:contains("Fees")')
        table2 = soup.select_one('table:contains("Earned")')
        if table1 == table2:
            df = pd.read_html(str(table1))
            list1.append(df)
            # HERE BELOW IS WHERE THE PROBLEM IS
            writer = pd.ExcelWriter('Tables_Fees_Earned.xlsx')
            for counter in range(len(list1)):
                sheet_name = 'Sheet%s' % counter
                pd.Series(name='').to_excel(writer, sheet_name=sheet_name)
            for c in range(len(list1)):
                list1[c].to_excel(writer, 'Sheet%s' % counter)
            writer.save()
        else:
            print(i)
The error is: AttributeError: 'list' object has no attribute 'to_excel'
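The error comes from the fact that pd.read_html returns a list of DataFrames, so each element of list1 is itself a list; a rough sketch of how the writing step could look instead, assuming the first table of each match is the one wanted:

import pandas as pd

# list1 is assumed to hold the results of pd.read_html(str(table1)) from the loop above.
with pd.ExcelWriter('Tables_Fees_Earned.xlsx') as writer:
    for counter, tables in enumerate(list1):
        # pd.read_html returns a list; take the first DataFrame of each result.
        tables[0].to_excel(writer, sheet_name='Sheet%s' % counter)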

Web-Scraping Python, Indexing Issue for DataFrame

I'm working on a web scraper for Spotify Charts to extract the top 200 daily songs for each day. I have managed to extract all the data I'm interested in, including rank, artist, track title, and stream numbers. What I'm stuck on is putting everything into a DataFrame to export as a CSV for Excel. Right now, when I print my DataFrame, it treats each cycle as 1 row with 4 columns instead of 200 rows with 4 columns.
I'm not sure what the issue is, as I've tried just about everything and looked into it as much as I could. I know something is wrong with the indexing, because each "what should be a row" has the same first "0" index, when they should run sequentially up to 199. Also, the column names of my DataFrame keep repeating after each "what should be a row", so I know there is definitely an issue there.
import requests
from bs4 import BeautifulSoup
from datetime import date, timedelta
from time import time
from time import sleep
from random import randint
import pandas as pd
import numpy as np

base_url = 'https://spotifycharts.com/regional/global/daily/'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, 'html.parser')
chart = soup.find('table', {'class': 'chart-table'})
tbody = chart.find('tbody')

for tr in tbody.find_all('tr'):
    rank_text = []
    rank_text_elem = tr.find('td', {'class': 'chart-table-position'})
    for item in rank_text_elem:
        rank_text = []
        rank_text.append(item)

    artist_text = []
    artist_text_elem = tr.find('td', {'class': 'chart-table-track'}).find_all('span')
    for item in artist_text_elem:
        artist_text = []
        artist_text.append(item.text.replace('by ', '').strip())

    title_text = []
    title_text_elem = tr.find('td', {'class': 'chart-table-track'}).find_all('strong')
    for item in title_text_elem:
        title_text = []
        title_text.append(item.text)

    streams_text = []
    streams_text_elem = tr.find('td', {'class': 'chart-table-streams'})
    for item in streams_text_elem:
        streams_text = []
        streams_text.append(item)

    # creating dataframe to store 4 variables
    list_of_data = list(zip(rank_text, artist_text, title_text, streams_text))
    df = pd.DataFrame(list_of_data, columns=['Rank', 'Artist', 'Title', 'Streams'])
    print(df)
Basically, I'm trying to create a DataFrame that holds 4 variables in each row, for 200 rows, for each date of the Spotify global charts. Please ignore some of the modules and libraries I've included at the top; they are used for iterating through each page of the historical data via dynamic URLs, which I have already figured out. Any help is greatly appreciated! Thank you!
Before the for loop I create the list all_rows.
Inside the for loop I add a list with a single row of data to all_rows.
After the for loop I use all_rows to create the DataFrame.
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = 'https://spotifycharts.com/regional/global/daily/'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, 'html.parser')
chart = soup.find('table', {'class': 'chart-table'})
tbody = chart.find('tbody')

all_rows = []
for tr in tbody.find_all('tr'):
    rank_text = tr.find('td', {'class': 'chart-table-position'}).text
    artist_text = tr.find('td', {'class': 'chart-table-track'}).find('span').text
    artist_text = artist_text.replace('by ', '').strip()
    title_text = tr.find('td', {'class': 'chart-table-track'}).find('strong').text
    streams_text = tr.find('td', {'class': 'chart-table-streams'}).text
    all_rows.append([rank_text, artist_text, title_text, streams_text])

# after `for` loop
df = pd.DataFrame(all_rows, columns=['Rank', 'Artist', 'Title', 'Streams'])
print(df.head())
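A possible follow-up, since the question ultimately wants a CSV export (the filename is illustrative):

df.to_csv('spotify_top200.csv', index=False)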
You could use pandas and requests
import pandas as pd
import requests
headers = {'User-Agent': 'Mozilla/5.0'}
url ='https://spotifycharts.com/regional/global/daily/'
r = requests.get(url, headers = headers).content
table = pd.read_html(r)[0] #transfer html to pandas
table.dropna(axis = 1, how = 'all', inplace = True) #drop nan column
table[['Title','Artist']] = table['Unnamed: 3'].str.split(' by ',expand=True) #split title artist strings into two columns
del table['Unnamed: 3'] #remove combined column
table = table[['Track', 'Artist','Title', 'Unnamed: 4']] #re-order cols
table.columns= ['Rank', 'Artist','Title', 'Streams'] #rename cols
print(table)

Unable to write data into Excel file (multiple tabs) using Python

I am not very familiar with writing data in Excel format using Python, and need some help writing my data output into a single .xlsx (Excel) file with multiple tabs.
My code is given here:
import time
import requests
import random
from lxml import html
from bs4 import BeautifulSoup
import xlsxwriter

def write_to_file(file, mode, data, newline=None, with_tab=None):
    with open(file, mode, encoding='utf-8') as l:
        if with_tab == True:
            data = '\t'.join(data)
        if newline == True:
            data = data + '\n'
        l.write(data)

# Build the list of URLs, one for each value of var.
link = ["http://ec.europa.eu/environment/ets/ohaDetails.do?returnURL=&languageCode=en&accountID=&registryCode=&buttonAction=all&action=&account.registryCode=&accountType=&identifierInReg=&accountHolder=&primaryAuthRep=&installationIdentifier=&installationName=&accountStatus=&permitIdentifier=&complianceStatus=&mainActivityType=-1&searchType=oha&resultList.currentPageNumber=" + str(var) + "&nextList=Next%C2%A0%3E&selectedPeriods=" for var in range(17500)]

start = 1
end = 20
for pagenum, links in enumerate(link[start:end]):
    print(links)
    r = requests.get(links)
    time.sleep(random.randint(2, 5))
    soup = BeautifulSoup(r.content, "lxml")

    # Table 2
    for items in soup.find(id="tblAccountContactInfo").find_all("tr")[:]:
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        print(dataset)
        write_to_file('Table3.tsv', 'a', dataset, with_tab=True, newline=True)
        write_to_file('Table3.tsv', 'a', links)

    # Table 3
    for items in soup.find(id="tblChildDetails").find("table").find_all("tr"):
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        print(dataset)
        write_to_file('Table3.tsv', 'a', dataset, with_tab=True, newline=True)
        write_to_file('Table3.tsv', 'a', links)

    # workbook = xlsxwriter.Workbook('Table3.xlsx')
    # worksheet = workbook.add_worksheet("Table 3")
    # worksheet.write(dataset)
    # workbook.close()
I need the output in a single .xlsx Excel workbook with multiple tabs, like a Table 1 tab and a Table 2 tab; currently I am writing the data in .tsv format. I have tried xlsxwriter but was unable to get results, so I commented those lines out. Please help.
You need to first create two worksheets, and keep track of the current row to be used for each worksheet. An append_row() function can then add one row of data to the required sheet.
import time
import requests
import random
from lxml import html
from bs4 import BeautifulSoup
import xlsxwriter

def append_row(ws, row):
    # Write one row of strings to the worksheet and advance its row counter.
    for col, value in enumerate(row):
        ws.write_string(ws.cur_row, col, value)
    ws.cur_row += 1

workbook = xlsxwriter.Workbook('output.xlsx')
ws_2 = workbook.add_worksheet("Table 2")
ws_3 = workbook.add_worksheet("Table 3")

# Keep track of the row to use in each worksheet
ws_2.cur_row = 0
ws_3.cur_row = 0

start = 1
end = 3

link = "http://ec.europa.eu/environment/ets/ohaDetails.do?returnURL=&languageCode=en&accountID=&registryCode=&buttonAction=all&action=&account.registryCode=&accountType=&identifierInReg=&accountHolder=&primaryAuthRep=&installationIdentifier=&installationName=&accountStatus=&permitIdentifier=&complianceStatus=&mainActivityType=-1&searchType=oha&resultList.currentPageNumber={}&nextList=Next%C2%A0%3E&selectedPeriods="

for page_number in range(start, end):
    print("Page {}".format(page_number))
    url = link.format(page_number)
    r = requests.get(url)
    time.sleep(random.randint(2, 5))
    soup = BeautifulSoup(r.content, "lxml")

    # Table 2
    for items in soup.find(id="tblAccountContactInfo").find_all("tr")[:]:
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        append_row(ws_2, [url] + dataset)

    # Table 3
    for items in soup.find(id="tblChildDetails").find("table").find_all("tr"):
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        append_row(ws_3, [url] + dataset)

workbook.close()
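If header rows are wanted, one option (the column names below are illustrative, not taken from the site) is to seed each sheet with xlsxwriter's write_row before the scraping loop and start cur_row at 1 instead of 0:

ws_2.write_row(0, 0, ['URL', 'Field 1', 'Field 2'])
ws_3.write_row(0, 0, ['URL', 'Field 1', 'Field 2'])
ws_2.cur_row = 1
ws_3.cur_row = 1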
