Extracting Several Tables to Excel From BeautifulSoup - python

I have extracted several tables with BeautifulSoup from different URLs, and they print fine within Python. However, when I try to export them to Excel, it doesn't work. The tables are stored in a list, but I can't find a way to write them to Excel.
I want each table in a separate sheet, but I can't write a list of tables straight to Excel.
I have a list of many URLs; here are a few of them to show what they look like.
https://www.sec.gov/Archives/edgar/data/3197/000119312510083400/ddef14a.htm
https://www.sec.gov/Archives/edgar/data/3197/000119312511098071/ddef14a.htm
https://www.sec.gov/Archives/edgar/data/3197/000119312512157233/d293744ddef14a.htm
https://www.sec.gov/Archives/edgar/data/3197/000119312513152959/d469796ddef14a.htm
from bs4 import BeautifulSoup
import requests
import pandas as pd
import xlwt

xl = pd.ExcelFile(r'/path/to/file/with/links.xlsx')
link = xl.parse('Sheet1')
book = xlwt.Workbook()
list1 = []

for i in range(10, 16):
    try:
        url = link['Link'][i]
        html = requests.get(url).content
        df_list = pd.read_html(html)
        # I have matched up two keywords
        soup = BeautifulSoup(html, 'lxml')
        table1 = soup.select_one('table:contains("Fees")')
        table2 = soup.select_one('table:contains("Earned")')
        if table1 == table2:
            df = pd.read_html(str(table1))
            list1.append(df)
            # HERE BELOW IS WHERE THE PROBLEM IS
            writer = pd.ExcelWriter('Tables_Fees_Earned.xlsx')
            for counter in range(len(list1)):
                sheet_name = 'Sheet%s' % counter
                pd.Series(name='').to_excel(writer, sheet_name=sheet_name)
            for c in range(len(list1)):
                list1[c].to_excel(writer, 'Sheet%s' % counter)
            writer.save()
        else:
            print(i)
The error is: AttributeError: 'list' object has no attribute 'to_excel'
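For reference, pd.read_html returns a list of DataFrames, so each item appended to list1 is itself a plain Python list, and a list has no .to_excel method. Below is a minimal sketch of one possible fix (not from the original thread; it keeps the question's list1, indexes with [0] to get the actual DataFrame, and moves the Excel writing after the loop so the workbook is written only once):
# Sketch: run this after the loop has filled list1.
with pd.ExcelWriter('Tables_Fees_Earned.xlsx') as writer:
    for counter, tables in enumerate(list1):
        # each element of list1 is a list of DataFrames; [0] is the matched table
        tables[0].to_excel(writer, sheet_name='Sheet%s' % counter)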

Related

Finding <caption class="table-title">

So I have written a script that scrapes tables from a website and saves them to an Excel sheet:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas import ExcelWriter
import os.path

path = "C:...."
url = 'https://zoek.officielebekendmakingen.nl/kst-35570-2.html'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

tables_df = pd.read_html(url, attrs={'class': 'kio2 portrait'})
tables = soup.find_all('table', class_="kio2 portrait")

titles = []
for table in tables:
    print(table)
    title = table.find_all("caption", class_="table-title")
    titles.append(title)
titles = []

writer = pd.ExcelWriter('output.xlsx')
for i, df in enumerate(tables_df, 1):
    df.to_excel(writer, index=True, sheet_name=f'sheetName_{i}')
writer.save()
This works, but now I want to find the title of each table so I can give each sheet that title. For example, the first table has the following markup, which contains the part I am interested in:
<table cellpadding="0" cellspacing="0" class="kio2 portrait" summary="Tabel 1.1 Budgettaire kerngegevens"><caption class="table-title">Tabel 1.1 Budgettaire kerngegevens</caption>
Now I want to scrape the part between <caption class="table-title"> and </caption>. Alternatively, I could use the summary attribute. How can I achieve this? I have tried it within the code above but have not found anything that works yet.
Try:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pandas import ExcelWriter

url = "https://zoek.officielebekendmakingen.nl/kst-35570-2.html"
soup = BeautifulSoup(requests.get(url).text, "html.parser")

writer = pd.ExcelWriter("output.xlsx")
for i, table in enumerate(soup.find_all("table", class_="kio2 portrait"), 1):
    df = pd.read_html(str(table))[0]
    caption = table.get("summary", "").replace(":", "").strip()
    # some tables don't have a summary, so fall back to a generic sheet name:
    if not caption:
        caption = f"table {i}"
    df.to_excel(writer, sheet_name=caption)
writer.save()
This creates output.xlsx with 185 sheets (at least when I open it in LibreOffice).
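A possible variation (a sketch, not part of the original answer) uses the <caption class="table-title"> text that the question asked about instead of the summary attribute. It also guards against two Excel constraints: sheet names must be unique, at most 31 characters long, and free of characters such as : / \ [ ] * ?. The context manager saves the workbook, so the writer.save() call that newer pandas versions have removed is not needed.
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://zoek.officielebekendmakingen.nl/kst-35570-2.html"
soup = BeautifulSoup(requests.get(url).text, "html.parser")

with pd.ExcelWriter("output.xlsx") as writer:
    for i, table in enumerate(soup.find_all("table", class_="kio2 portrait"), 1):
        df = pd.read_html(str(table))[0]
        caption_tag = table.find("caption", class_="table-title")
        caption = caption_tag.get_text(strip=True) if caption_tag else f"table {i}"
        # prefix with the table index for uniqueness, strip forbidden
        # characters, and cap the name at Excel's 31-character limit
        sheet_name = re.sub(r"[\[\]:*?/\\]", " ", f"{i} {caption}")[:31]
        df.to_excel(writer, sheet_name=sheet_name)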

How to save new data into an existing CSV when scraping

I'm scraping data in real time with Python in a Jupyter notebook, with Task Scheduler running the scrape every month. Everything is going fine and the data is saved into a CSV and SQL Server, but the problem is that every time the data updates, the CSV file doesn't change to the new data.
here is my code
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://www.indexmundi.com/commodities/?commodity=potassium-chloride&months=300"
r = requests.get(url)
html = r.text
soup = BeautifulSoup(html)
table = soup.find('table', {"class": "tblData"})
rows = table.find_all('tr')

data = []
for row in rows[1:]:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append([ele for ele in cols if ele])

result = pd.DataFrame(data, columns=['month', 'price', 'change'])
result['month'] = pd.to_datetime(result["month"])
result.to_csv("kcl.csv", index=False, mode='w')

df = pd.read_csv("kcl.csv")
pd.set_option('display.max_rows', df.shape[0] + 1)
print(df)

import pyodbc
from sqlalchemy import create_engine

server = 'MSHULHAN\SQLEXPRESS'
database = 'daming'
engine = create_engine('mssql+pyodbc://' + server + '/' + database + '?trusted_connection=yes&driver=ODBC+Driver+13+for+SQL+Server')
#engine = create_engine('mysql://root:#localhost/daming')  # enter your password and database names here

col_names = ["month", "price", "change"]
df = pd.read_csv("kcl.csv", sep=',', quotechar='\'', encoding='utf8', names=col_names, skiprows=1)  # Replace Excel_file_name with your excel sheet name
df.to_sql('kcl', con=engine, index=False, if_exists='replace')  # Replace Table_name with your sql table name
With SQL Server I can just use if_exists='replace', but with the CSV nothing changes. Please help me, thank you in advance!
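One possible way to make the CSV behave like if_exists='replace' plus an append (a sketch only, assuming result is the DataFrame built in the code above and kcl.csv is the target file): read whatever is already on disk, concatenate the freshly scraped rows, drop duplicate months, and rewrite the file.
import os
import pandas as pd

def update_csv(new_rows, path="kcl.csv"):
    # merge the freshly scraped rows with the existing file, if any
    if os.path.exists(path):
        old = pd.read_csv(path, parse_dates=["month"])
        combined = pd.concat([old, new_rows], ignore_index=True)
    else:
        combined = new_rows
    # keep the most recent value for each month and rewrite the whole file
    combined = combined.drop_duplicates(subset="month", keep="last")
    combined = combined.sort_values("month")
    combined.to_csv(path, index=False)
    return combined

# usage with the DataFrame from the question's code:
# update_csv(result)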

How to scrape all values of AJAX search table with PYTHON?

I am trying to scrape the CPU Specs Database at TechPowerUp.
I have found that the table updates using AJAX and have created the following code:
import requests
from bs4 import BeautifulSoup
import csv
import string

cpus = []
base = 'https://www.techpowerup.com/cpu-specs/?ajaxsrch='
letters = list(string.ascii_lowercase)
letters.extend(range(0, 10))

for i in letters:
    URL = base + str(i)
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    with open('cpu2.csv', mode='a') as cpu_csv:
        headers = ['name', 'family', 'socket', 'release']
        writer = csv.DictWriter(cpu_csv, fieldnames=headers, lineterminator='\n')
        for tr in soup.find_all('tr')[1:]:
            tds = tr.find_all('td')
            if tds[0].text.strip() not in cpus:
                writer.writerow({'name': tds[0].text.strip(), 'family': tds[1].text.strip(), 'socket': tds[4].text.strip(), 'release': tds[8].text.strip()})
                cpus.append(tds[0].text.strip())
            else:
                print("duplicate")
This code works in the sense that it loops through A-Z and then 0-9 and populates a CSV file while ignoring duplicates; however, I have a logical problem in that I am only scraping ~600 results when there are over 2,000 entries.
I believe this may be due to a limit on the number of items returned for each AJAX search request, so not all entries are discovered. Is there a different approach to fetch all results?
Thanks
import pandas as pd
import string

items = string.digits + string.ascii_lowercase

def main(url):
    data = []
    for item in items:
        print(f"{item}")
        df = pd.read_html(url.format(item))[0]
        df = df[["Name", "Codename", "Socket", "Released"]]
        data.append(df)
    data = pd.concat(data)
    data.drop_duplicates(subset='Name', keep="first", inplace=True)
    data.to_csv("data.csv", index=False)

main("https://www.techpowerup.com/cpu-specs/?ajaxsrch={}")
The total output is 596 rows after removing duplicates by the Name column.
The easiest way is to get the table data using pandas: read the data into a DataFrame and then export it to CSV.
Code:
import string
import pandas as pd

base = 'https://www.techpowerup.com/cpu-specs/?ajaxsrch='
letters = list(string.ascii_lowercase)
letters.extend(range(0, 10))

df = pd.DataFrame()
for i in letters:
    URL = base + str(i)
    df1 = pd.read_html(URL)[0]
    df = df.append(df1, ignore_index=True)

print(df[['Name', 'Codename', 'Socket', 'Released']])  # This will give you 1739 records

# If you want to delete duplicates use this
df.drop_duplicates(subset='Name', keep='first', inplace=True)
print(df[['Name', 'Codename', 'Socket', 'Released']])  # This will give you 595 records

# Import into Csv file
df[['Name', 'Codename', 'Socket', 'Released']].to_csv("cpu_csv.csv", index=False)
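Both answers end up at roughly the same ~596 unique rows, which is consistent with the asker's suspicion that each AJAX search only returns a limited number of rows. One hedged workaround (a sketch, not from the original answers, and assuming the endpoint accepts longer search strings) is to query with two-character combinations so each request matches fewer CPUs and is less likely to be truncated, then deduplicate the combined result:
import string
import time
from itertools import product
import pandas as pd

base = "https://www.techpowerup.com/cpu-specs/?ajaxsrch={}"
chars = string.ascii_lowercase + string.digits

frames = []
for a, b in product(chars, repeat=2):
    query = a + b
    try:
        # a narrower query returns fewer rows, so a per-request cap
        # is less likely to cut off results
        frames.append(pd.read_html(base.format(query))[0])
    except ValueError:
        # pd.read_html raises ValueError when the response contains no table
        pass
    time.sleep(1)  # ~1,300 requests in total, so be polite to the server

df = pd.concat(frames, ignore_index=True)
df = df.drop_duplicates(subset="Name", keep="first")
df[["Name", "Codename", "Socket", "Released"]].to_csv("cpu_all.csv", index=False)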

Saving multiple data frames from loop

I have been searching for a solution to my problem, but all the answers I find use print() at the end instead of saving the data frames as I would like to.
Below is an (almost) functioning piece of code that prints 3 separate tables. How do I save these three tables in 3 separate data frames named matches_october, matches_november and matches_december?
The last line in my code does not work the way I want it to. I hope it is clear what I would like the code to do (save a data frame at the end of each of the 3 rounds of the loop).
import pandas as pd
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'

for i in valid_pages:
    url = '{}{}{}'.format(base_url, i, end)
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find_all('table')[0]
    df = pd.read_html(str(table))
    print(df)
    matches + valid_pages = df[0]
You can special-case each month, but that's not very robust (and it's rather ugly):
if i == 'october':
    matches_october = pd.read_html(str(table))
if i == 'november':
    # so on and so forth
A more elegant solution is to use a dictionary. Before the loop, declare matches = {}. Then, in each iteration:
matches[i] = pd.read_html(str(table))
Then you can access the October matches DataFrame via matches['october'].
You can't compose variable names using +, try using a dict instead:
import pandas as pd
import requests
from bs4 import BeautifulSoup

matches = {}  # create an empty dict
base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'

for i in valid_pages:
    url = '{}{}{}'.format(base_url, i, end)
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find_all('table')[0]
    df = pd.read_html(str(table))
    print(df)
    matches[i] = df[0]  # store it in the dict
Thanks guys. That worked! :)
import pandas as pd
import requests
from bs4 import BeautifulSoup

matches = {}  # create an empty dict
base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'

for i in valid_pages:
    url = '{}{}{}'.format(base_url, i, end)
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find_all('table')[0]
    df = pd.read_html(str(table))
    matches[i] = df[0]  # store it in the dict

matches_october = matches['october']

Unable to write data into Excel file (multiple tabs) using Python

I am not very familiar with writing data in Excel format using Python and need some help writing my output into a single .xlsx (Excel) file with multiple tabs.
My code is given here:
import time
import requests
import random
from lxml import html
from bs4 import BeautifulSoup
import xlsxwriter

def write_to_file(file, mode, data, newline=None, with_tab=None):
    with open(file, mode, encoding='utf-8') as l:
        if with_tab == True:
            data = '\t'.join(data)
        if newline == True:
            data = data + '\n'
        l.write(data)

link = ["http://ec.europa.eu/environment/ets/ohaDetails.do?returnURL=&languageCode=en&accountID=&registryCode=&buttonAction=all&action=&account.registryCode=&accountType=&identifierInReg=&accountHolder=&primaryAuthRep=&installationIdentifier=&installationName=&accountStatus=&permitIdentifier=&complianceStatus=&mainActivityType=-1&searchType=oha&resultList.currentPageNumber="+str(var)+"&nextList=Next%C2%A0%3E&selectedPeriods=" for var in range(17500)]  # This will read the URLs line by line as per the specific value of var.

start = 1
end = 20
for pagenum, links in enumerate(link[start:end]):
    print(links)
    r = requests.get(links)
    time.sleep(random.randint(2, 5))
    soup = BeautifulSoup(r.content, "lxml")

    # Table 2
    for items in soup.find(id="tblAccountContactInfo").find_all("tr")[:]:
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        print(dataset)
        write_to_file('Table3.tsv', 'a', dataset, with_tab=True, newline=True)
        write_to_file('Table3.tsv', 'a', links)

    # Table 3
    for items in soup.find(id="tblChildDetails").find("table").find_all("tr"):
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        print(dataset)
        write_to_file('Table3.tsv', 'a', dataset, with_tab=True, newline=True)
        write_to_file('Table3.tsv', 'a', links)

    #workbook = xlsxwriter.Workbook('Table3.xlsx')
    #worksheet = workbook.add_worksheet("Table 3")
    #worksheet.write(dataset)
    #workbook.close()
I need the output in an .xlsx Excel workbook with multiple tabs, like a Table 1 tab and a Table 2 tab; currently I am fetching the data in .tsv format. I have tried xlsxwriter but was unable to get results, so I commented out those lines. Please help.
You need to first create two worksheets, and keep track of the current row to be used for each worksheet. An append_row() function can then add one row of data to the required sheet.
import time
import requests
import random
from lxml import html
from bs4 import BeautifulSoup
import xlsxwriter

def append_row(ws, row):
    for col, value in enumerate(row):
        ws.write_string(ws.cur_row, col, value)
    ws.cur_row += 1

workbook = xlsxwriter.Workbook('output.xlsx')
ws_2 = workbook.add_worksheet("Table 2")
ws_3 = workbook.add_worksheet("Table 3")

# Keep a track of the row to use in each worksheet
ws_2.cur_row = 0
ws_3.cur_row = 0

start = 1
end = 3

link = "http://ec.europa.eu/environment/ets/ohaDetails.do?returnURL=&languageCode=en&accountID=&registryCode=&buttonAction=all&action=&account.registryCode=&accountType=&identifierInReg=&accountHolder=&primaryAuthRep=&installationIdentifier=&installationName=&accountStatus=&permitIdentifier=&complianceStatus=&mainActivityType=-1&searchType=oha&resultList.currentPageNumber={}&nextList=Next%C2%A0%3E&selectedPeriods="

for page_number in range(start, end):
    print("Page {}".format(page_number))
    url = link.format(page_number)
    r = requests.get(url)
    time.sleep(random.randint(2, 5))
    soup = BeautifulSoup(r.content, "lxml")

    # Table 2
    for items in soup.find(id="tblAccountContactInfo").find_all("tr")[:]:
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        append_row(ws_2, [url] + dataset)

    # Table 3
    for items in soup.find(id="tblChildDetails").find("table").find_all("tr"):
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        append_row(ws_3, [url] + dataset)

workbook.close()
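As a variation (a sketch, not part of the original answer), the same two tables could also be read straight into DataFrames with pd.read_html and written to separate tabs with pandas' ExcelWriter; the context manager saves and closes the workbook.
import pandas as pd
import requests
from bs4 import BeautifulSoup

# assumes `link` is the URL template defined in the answer above
url = link.format(1)
soup = BeautifulSoup(requests.get(url).content, "lxml")

# read each HTML table into a DataFrame
df_contact = pd.read_html(str(soup.find(id="tblAccountContactInfo")))[0]
df_child = pd.read_html(str(soup.find(id="tblChildDetails").find("table")))[0]

# one sheet per table
with pd.ExcelWriter("output_pandas.xlsx") as writer:
    df_contact.to_excel(writer, sheet_name="Table 2", index=False)
    df_child.to_excel(writer, sheet_name="Table 3", index=False)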
