I'm doing real-time data scraping with Python in a Jupyter notebook, with Task Scheduler running the scrape every month. Everything works fine: the data is saved to a CSV file and to SQL Server. But the problem is that every time the data updates, the CSV file won't change to the new one.
Here is my code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
url = "https://www.indexmundi.com/commodities/?commodity=potassium-chloride&months=300"
r = requests.get(url)
html = r.text
soup = BeautifulSoup(html, 'html.parser')  # specify a parser explicitly
table = soup.find('table', {"class": "tblData"})
rows = table.find_all('tr')
data = []
for row in rows[1:]:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append([ele for ele in cols if ele])
result = pd.DataFrame(data, columns=['month', 'price', 'change'])
result['month'] = pd.to_datetime(result["month"])
result.to_csv("kcl.csv", index=False, mode='w')
df = pd.read_csv("kcl.csv")
pd.set_option('display.max_rows', df.shape[0]+1)
print(df)
import pyodbc
from sqlalchemy import create_engine
server = r'MSHULHAN\SQLEXPRESS'  # raw string so the backslash is not treated as an escape
database = 'daming'
engine = create_engine('mssql+pyodbc://' + server + '/' + database + '?trusted_connection=yes&driver=ODBC+Driver+13+for+SQL+Server')
#engine = create_engine('mysql://root:#localhost/daming') # enter your password and database names here
col_names = ["month", "price", "change"]
df = pd.read_csv("kcl.csv", sep=',', quotechar='\'', encoding='utf8', names=col_names, skiprows=1)  # re-read the CSV that was just written
df.to_sql('kcl', con=engine, index=False, if_exists='replace')  # replace the 'kcl' table in SQL Server on every run
With SQL Server I can just use if_exists='replace', but with the CSV nothing changes. Please help me, thank you in advance!
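A hedged guess, since the question doesn't show how the notebook is launched: when Task Scheduler runs the job, the working directory is often different from the one used interactively, so kcl.csv may be written to one folder while an old copy elsewhere is the one being opened. Writing and reading the file by absolute path (the path below is only a placeholder) removes that ambiguity:
from pathlib import Path
# hypothetical fixed location for the CSV; adjust to your own folder
CSV_PATH = Path(r"C:\data\kcl.csv")
# overwrite the file at the fixed location on every run
result.to_csv(CSV_PATH, index=False)
# read the same file back from the same fixed location
df = pd.read_csv(CSV_PATH)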
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://www.pivottrading.co.in/beta/tools/open-high-low-scanner.php?broker=zerodha"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
table = soup.find('table', {'class' : 'table'})
rows = table.find_all('th')
headers = []
for i in table.find_all('th'):
    title = i.text
    headers.append(title)
df = pd.DataFrame(columns = headers)
for row in table.find_all('tr')[1:]:
    data = row.find_all('td')
    row_data = [td.text.strip() for td in data]
    length = len(df)
    df.loc[length] = row_data
print(df)
I need to scrape a table from a website, but it has a select-all checkbox for each row. What should I do?
Any help will be appreciated, thank you.
(If I understand your question correctly: you want to remove the checkboxes from the output of the table.)
Since the checkboxes occupy the first column of the table, you can skip them with index slicing. Use [1:], which takes everything from index 1 onward, skipping the zero-based index 0 (the checkbox column).
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = (
"https://www.pivottrading.co.in/beta/tools/open-high-low-scanner.php?broker=zerodha"
)
page = requests.get(url)
soup = BeautifulSoup(page.text, "lxml")
table = soup.find("table", {"class": "table"})
headers = []
for i in table.find_all("th"):
    title = i.text.strip()
    headers.append(title)

rows = []
for row in table.find_all("tr")[1:]:
    data = row.find_all("td")
    # skip the first cell (the checkbox) in each row
    rows.append([td.text.strip() for td in data[1:]])

# drop the last table row and the checkbox header column
df = pd.DataFrame(rows[:-1], columns=headers[1:])
print(df)
Output:
Scrip P.Close Open High Low LTP # REAL LTP(NOW) Result
0 BRITANNIA 3379.10 3385.00 3447.00 3385.00 3439.50 3439.50 0
1 EICHERMOT 2551.20 2565.00 2634.00 2565.00 2625.05 2625.05 0
You don't need to check those boxes in order to return all rows.
You can grab the table with pandas and drop the first column by name (if desired).
You can also do some tidying to match the web page.
import pandas as pd
df = pd.read_html('https://www.pivottrading.co.in/beta/tools/open-high-low-scanner.php?broker=zerodha')[0]
df.drop(columns={'Sr.No.'}, inplace=True)
df.iloc[-1, 0:4] = ''
df.fillna(0, inplace=True)
df
I'm trying to transfer the data of a long table (24 pages) into a pandas DataFrame, but I'm facing some issues with (I think) the for-loop code.
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = 'https://scrapethissite.com/pages/forms/?page_num={}'
res = requests.get(base_url.format('1'))
soup = BeautifulSoup(res.text, 'lxml')
table = soup.select('table.table')[0]
columns = table.find('tr').find_all('th')
columns_names = [str(c.get_text()).strip() for c in columns]
table_rows = table.find_all('tr', class_='team')
l = []
for n in range(1, 25):
    scrape_url = base_url.format(n)
    res = requests.get(scrape_url)
    soup = BeautifulSoup(res.text, 'lxml')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [str(tr.get_text()).strip() for tr in td]
        l.append(row)
df = pd.DataFrame(l, columns=columns_names)
The Dataframe comes out as a repetition of the first page only, rather than a copy of all the data in the table.
I agree with @mxbi.
Try this:
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = 'https://scrapethissite.com/pages/forms/?page_num={}'
l = []
for n in range(1, 25):
    scrape_url = base_url.format(n)
    res = requests.get(scrape_url)
    soup = BeautifulSoup(res.text, 'lxml')

    # parse the table of the page fetched in this iteration,
    # instead of reusing the one parsed before the loop
    table = soup.select('table.table')[0]
    columns = table.find('tr').find_all('th')
    columns_names = [str(c.get_text()).strip() for c in columns]
    table_rows = table.find_all('tr', class_='team')

    for tr in table_rows:
        td = tr.find_all('td')
        row = [str(tr.get_text()).strip() for tr in td]
        l.append(row)

df = pd.DataFrame(l, columns=columns_names)
requests is needed because the server wants a user-agent header and pandas read_html doesn't allow for that. As you still want to use pandas to generate the DataFrame, you could gain some efficiency by using multiprocessing to handle the requests, and within a user-defined function extract the table of interest and pass it as a string to read_html. You will get a list of DataFrames which can be combined with pandas concat.
Note: This can't be run from within Jupyter, as it will block.
import pandas as pd
from multiprocessing import Pool, cpu_count
import requests
from bs4 import BeautifulSoup as bs
def get_table(url: str) -> pd.DataFrame:
    soup = bs(requests.get(url).text, 'lxml')
    df = pd.read_html(str(soup.select_one('.table')))[0]
    df['page_num'] = url.split("=")[-1]
    return df

if __name__ == '__main__':
    urls = [f'https://scrapethissite.com/pages/forms/?page_num={i}' for i in range(1, 25)]
    with Pool(cpu_count() - 1) as p:
        results = p.map(get_table, urls)
    final = pd.concat(results)
    print(final)
    # final.to_csv('data.csv', index=False, encoding='utf-8-sig')
I am trying to scrape the CPU Specs Database at TechPowerUp.
I have found that the table updates using AJAX, and I created the following code:
import requests
from bs4 import BeautifulSoup
import csv
import string
cpus = []
base = 'https://www.techpowerup.com/cpu-specs/?ajaxsrch='
letters = list(string.ascii_lowercase)
letters.extend(range(0, 10))
for i in letters:
    URL = base + str(i)
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    with open('cpu2.csv', mode='a') as cpu_csv:
        headers = ['name', 'family', 'socket', 'release']
        writer = csv.DictWriter(cpu_csv, fieldnames=headers, lineterminator='\n')
        for tr in soup.find_all('tr')[1:]:
            tds = tr.find_all('td')
            if tds[0].text.strip() not in cpus:
                writer.writerow({'name': tds[0].text.strip(), 'family': tds[1].text.strip(), 'socket': tds[4].text.strip(), 'release': tds[8].text.strip()})
                cpus.append(tds[0].text.strip())
            else:
                print("duplicate")
This code works, in that it loops through a-z and then 0-9 and populates a CSV file while ignoring duplicates. However, I'm hitting a logical error: I am only scraping ~600 results when there are over 2000 entries.
I believe this may be due to a limit on the number of items returned by each AJAX search request, so not all entries are discovered. Is there a different approach to fetch all results?
Thanks
import pandas as pd
import string
items = string.digits + string.ascii_lowercase
def main(url):
    data = []
    for item in items:
        print(f"{item}")
        df = pd.read_html(url.format(item))[0]
        df = df[["Name", "Codename", "Socket", "Released"]]
        data.append(df)
    data = pd.concat(data)
    data.drop_duplicates(subset='Name', keep="first", inplace=True)
    data.to_csv("data.csv", index=False)

main("https://www.techpowerup.com/cpu-specs/?ajaxsrch={}")
The total output is 596 rows after removing duplicates by the Name column.
The easiest way to get the table data is with pandas: read the data into a DataFrame and then export it to CSV.
Code:
import string
import pandas as pd
base = 'https://www.techpowerup.com/cpu-specs/?ajaxsrch='
letters = list(string.ascii_lowercase)
letters.extend(range(0, 10))
df = pd.DataFrame()
for i in letters:
    URL = base + str(i)
    df1 = pd.read_html(URL)[0]
    # DataFrame.append was removed in pandas 2.0, so concatenate instead
    df = pd.concat([df, df1], ignore_index=True)

print(df[['Name','Codename','Socket','Released']])  # this will give you 1739 records

# if you want to delete duplicates, use this
df.drop_duplicates(subset='Name', keep='first', inplace=True)
print(df[['Name','Codename','Socket','Released']])  # this will give you 595 records

# export to a CSV file
df[['Name','Codename','Socket','Released']].to_csv("cpu_csv.csv", index=False)
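If the shortfall really is a per-query cap on the ajaxsrch endpoint (an assumption; I haven't confirmed the server's limit), one hedged workaround is to issue narrower queries, for example every two-character alphanumeric combination, so each response stays under the cap, and then deduplicate the combined result. A sketch, reusing pandas read_html as in the answers above:
import itertools
import string
import time
import pandas as pd

base = 'https://www.techpowerup.com/cpu-specs/?ajaxsrch='
chars = string.ascii_lowercase + string.digits

frames = []
for a, b in itertools.product(chars, repeat=2):  # 'aa', 'ab', ..., '99'
    try:
        frames.append(pd.read_html(base + a + b)[0])
    except ValueError:
        pass  # read_html raises ValueError when a search returns no table
    time.sleep(0.5)  # be polite; the site may rate-limit 1296 requests

data = pd.concat(frames).drop_duplicates(subset='Name')
data[['Name', 'Codename', 'Socket', 'Released']].to_csv('cpu_all.csv', index=False)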
import sys,csv,os
import pandas as pd
from bs4 import BeautifulSoup
import requests
from lxml import html
#url = r'https://agmarknet.gov.in/SearchCmmMkt.aspx?Tx_Commodity=137&Tx_State=0&Tx_District=0&Tx_Market=0&DateFrom=01-jan-2016&DateTo=19-nov-2019&Fr_Date=01-jan-2016&To_Date=19-nov-2019&Tx_Trend=2&Tx_CommodityHead=Ajwan&Tx_StateHead=--Select--&Tx_DistrictHead=--Select--&Tx_MarketHead=--Select--'
Export_Path = r"E:\Knoema_Work_Dataset"
Res = requests.get(url)
Soup = BeautifulSoup(Res.content,'lxml')
#print(Soup.prettify())
mylists = ['137','281','325','166','86','130']
for mylist in mylists:
    url = 'https://agmarknet.gov.in/SearchCmmMkt.aspx?Tx_Commodity='+mylist+'+&Tx_State=0&Tx_District=0&Tx_Market=0&DateFrom=01-jan-2016&DateTo=19-nov-2019&Fr_Date=01-jan-2016&To_Date=19-nov-2019&Tx_Trend=2&Tx_CommodityHead=Ajwan&Tx_StateHead=--Select--&Tx_DistrictHead=--Select--&Tx_MarketHead=--Select--'+ mylist
    soup = BeautifulSoup(Res.content,'lxml')
    table = soup.find('table', {'class':'tableagmark_new'})
    DataAll = pd.DataFrame(columns = ['State Name','District Name','Market Name','Variety','Group','Arrivals (Tonnes)','Min Price (Rs./Quintal)','Max Price (Rs./Quintal)','Modal Price (Rs./Quintal)','Reported Date'],dtype = object,index=range(0,1000))
    row_marker = 0
    for row in table.find_all('tr'):
        column_marker = 0
        columns = row.findAll('td')
        for column in columns:
            DataAll.iat[row_marker,column_marker] = column.get_text()
            column_marker += 1
DataAll
Export_Path_F = os.path.join(Export_Path, 'aggr.csv')
DataAll.to_csv(Export_Path_F, encoding='utf-8-sig', index=False)
I am getting only the last row of the table in the DataFrame 'DataAll', but I need the full table in the DataFrame.
I made iterations to scrape data from multiple tables into a single DataFrame.
Please help me so that I can get all the contents into the DataFrame.
Url = https://agmarknet.gov.in/SearchCmmMkt.aspx?Tx_Commodity=137&Tx_State=0&Tx_District=0&Tx_Market=0&DateFrom=01-jan-2016&DateTo=19-nov-2019&Fr_Date=01-jan-2016&To_Date=19-nov-2019&Tx_Trend=2&Tx_CommodityHead=Ajwan&Tx_StateHead=--Select--&Tx_DistrictHead=--Select--&Tx_MarketHead=--Select--
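Not a definitive answer, but two things stand out in the posted loop: the page fetched once before the loop (Res) is parsed on every iteration instead of requesting each commodity's URL, and row_marker is never incremented, so every cell write lands in row 0 and only the last row survives. A minimal sketch of how the loop might look with those two changes, reusing the imports, mylists, and Export_Path from the question (the stray second +mylist at the end of the original URL is dropped here, on the assumption that it was accidental):
cols = ['State Name','District Name','Market Name','Variety','Group','Arrivals (Tonnes)',
        'Min Price (Rs./Quintal)','Max Price (Rs./Quintal)','Modal Price (Rs./Quintal)','Reported Date']
frames = []
for mylist in mylists:
    url = ('https://agmarknet.gov.in/SearchCmmMkt.aspx?Tx_Commodity=' + mylist +
           '&Tx_State=0&Tx_District=0&Tx_Market=0&DateFrom=01-jan-2016&DateTo=19-nov-2019'
           '&Fr_Date=01-jan-2016&To_Date=19-nov-2019&Tx_Trend=2&Tx_CommodityHead=Ajwan'
           '&Tx_StateHead=--Select--&Tx_DistrictHead=--Select--&Tx_MarketHead=--Select--')
    res = requests.get(url)                        # fetch this commodity's page inside the loop
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find('table', {'class': 'tableagmark_new'})
    rows = []
    for row in table.find_all('tr'):
        cells = [td.get_text(strip=True) for td in row.find_all('td')]
        if len(cells) == len(cols):                # keep only rows with the expected number of cells
            rows.append(cells)
    frames.append(pd.DataFrame(rows, columns=cols))

DataAll = pd.concat(frames, ignore_index=True)     # every commodity's rows in one DataFrame
DataAll.to_csv(os.path.join(Export_Path, 'aggr.csv'), encoding='utf-8-sig', index=False)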
I have extracted several tables with BeautifulSoup from different URLs, and they print fine within Python. However, when I try to export them to Excel, it doesn't work. The tables are stored in a list, but I can't find a way to store them in Excel.
I want every table in a different sheet, but I can't write a list straight to Excel.
I have a list of many URLs; here are some of them to show how they look:
https://www.sec.gov/Archives/edgar/data/3197/000119312510083400/ddef14a.htm
https://www.sec.gov/Archives/edgar/data/3197/000119312511098071/ddef14a.htm
https://www.sec.gov/Archives/edgar/data/3197/000119312512157233/d293744ddef14a.htm
https://www.sec.gov/Archives/edgar/data/3197/000119312513152959/d469796ddef14a.htm
from bs4 import BeautifulSoup
import requests
import pandas as pd
import xlwt
xl = pd.ExcelFile(r'/path/to/file/with/links.xlsx')
link = xl.parse('Sheet1')
book = xlwt.Workbook()
list1 = []
for i in range(10,16):
    try:
        url = link['Link'][i]
        html = requests.get(url).content
        df_list = pd.read_html(html)
        # I have matched up two keywords
        soup = BeautifulSoup(html,'lxml')
        table1 = soup.select_one('table:contains("Fees")')
        table2 = soup.select_one('table:contains("Earned")')
        if table1 == table2:
            df = pd.read_html(str(table1))
            list1.append(df)
            # HERE BELOW IS WHERE THE PROBLEM IS
            writer = pd.ExcelWriter('Tables_Fees_Earned.xlsx')
            for counter in range(len(list1)):
                sheet_name = 'Sheet%s' % counter
                pd.Series(name = '').to_excel(writer, sheet_name=sheet_name)
            for c in range(len(list1)):
                list1[c].to_excel(writer,'Sheet%s' % counter)
            writer.save()
        else:
            print(i)
    except Exception as err:
        # the original except clause did not survive the paste; surfacing the error here
        print(i, err)
The error is: AttributeError: 'list' object has no attribute 'to_excel'
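A likely cause, judging from that traceback: pd.read_html always returns a list of DataFrames, so df = pd.read_html(str(table1)) makes each element of list1 a list, and list1[c].to_excel then fails. A minimal sketch of the export step under that assumption, writing each matched table to its own sheet (it assumes list1 already holds the collected results):
import pandas as pd

with pd.ExcelWriter('Tables_Fees_Earned.xlsx') as writer:
    for counter, item in enumerate(list1):
        # read_html returns a list; take the first DataFrame if it hasn't been unwrapped yet
        df = item[0] if isinstance(item, list) else item
        df.to_excel(writer, sheet_name='Sheet%s' % counter, index=False)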