scraping select all checkbox using python - python

import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://www.pivottrading.co.in/beta/tools/open-high-low-scanner.php?broker=zerodha"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
table = soup.find('table', {'class' : 'table'})
rows = table.find_all('th')
headers = []
for i in table.find_all('th'):
title = i.text
headers.append(title)
df = pd.DataFrame(columns = headers)
for row in table.find_all('tr')[1:]:
data = row.find_all('td')
row_data = [td.text.strip() for td in data]
length = len(df)
df.loc[length] = row_data
print(df)
I need to scrape a table from a website but it has select all checkbox for each row .What should I do.
Any help will be appreciated thank you.

(If I understand your question correctly: you want to remove the checkboxes from the output of the table).
Since the checkboxes are the first index of the table, you can skip them using index slicing. Use: [1:], which means: "from the first index to the last" (skipping the zero-based index).
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = (
"https://www.pivottrading.co.in/beta/tools/open-high-low-scanner.php?broker=zerodha"
)
page = requests.get(url)
soup = BeautifulSoup(page.text, "lxml")
table = soup.find("table", {"class": "table"})
rows = table.find_all("th")
headers = []
for i in table.find_all("th"):
title = i.text.strip()
headers.append(title)
rows = []
for row in table.find_all("tr")[1:]:
data = row.find_all("td")
rows.append(td.text.strip() for td in data[1:])
df = pd.DataFrame(rows[:-1], columns=headers[1:])
print(df)
Output:
Scrip P.Close Open High Low LTP # REAL LTP(NOW) Result
0 BRITANNIA 3379.10 3385.00 3447.00 3385.00 3439.50 3439.50 0
1 EICHERMOT 2551.20 2565.00 2634.00 2565.00 2625.05 2625.05 0

You don't need to check those boxes in order to return all rows.
You can grab the table with pandas and drop the first column by name (if desired).
You can also do some tidying to match the web page.
import pandas as pd
df = pd.read_html('https://www.pivottrading.co.in/beta/tools/open-high-low-scanner.php?broker=zerodha')[0]
df.drop(columns={'Sr.No.'}, inplace=True)
df.iloc[-1, 0:4] = ''
df.fillna(0, inplace=True)
df

Related

i have tried to scrape a table using beautifulsoup and only one line of the table appears as output

I have tried to scrape the table http://www.geonames.org/search.html?q=kwadukuza&country=ZA, however only the last line of the table appears
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'http://www.geonames.org/search.html?q=kwadukuza&country=ZA'
requests.get(url)
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
table_data = soup.find('table', class_ = "restable")
headers = []
for i in table_data.find_all('th'):
title = i.text.strip()
headers.append(title)=
df = pd.DataFrame(columns = headers)
for j in table_data.find_all('tr', class_='odd'):
row_data = j.find_all('td')
row = [tr.text.strip() for tr in row_data]
you can use seperate list to append row data to make list of list data and then use it as row for your df
all_rows=[]
for j in table_data.find_all('tr',class_="odd"):
row_data = j.find_all('td')
row = [tr.text.strip() for tr in row_data]
all_rows.append(row)
For DataFrame:
df = pd.DataFrame(columns = headers,data=all_rows)
Output:
df.shape
(25,6)
As the comment already says, you need to put the row = [tr.text.strip() for tr in row_data] in the for loop. Otherwise you would just get the last entry.
In order to add the rows to the DataFrame, you need to make a list of all rows and put it together with the headers to a DataFrame. You could also append the rows to the DataFrame, but it is less efficient
Solution
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'http://www.geonames.org/search.html?q=kwadukuza&country=ZA'
requests.get(url)
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
table_data = soup.find('table', class_ = "restable")
headers = []
for i in table_data.find_all('th'):
title = i.text.strip()
headers.append(title)
data = []
for j in table_data.find_all('tr', class_='odd'):
row_data = j.find_all('td')
row = [tr.text.strip() for tr in row_data] # Put into the for loop
data.append(row)
# DataFrame
df = pd.DataFrame(columns=headers, data=data)
print(df)

Scraping table (several pages) to Pandas Dataframe

I'm trying to transfer the data of a long table (24 pages) to a Pandas Dataframe, but facing some issues with (i think) the for-loop code.
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = 'https://scrapethissite.com/pages/forms/?page_num={}'
res = requests.get(base_url.format('1'))
soup = BeautifulSoup(res.text, 'lxml')
table = soup.select('table.table')[0]
columns = table.find('tr').find_all('th')
columns_names = [str(c.get_text()).strip() for c in columns]
table_rows = table.find_all('tr', class_='team')
l = []
for n in range(1, 25):
scrape_url = base_url.format(n)
res = requests.get(scrape_url)
soup = BeautifulSoup(res.text, 'lxml')
for tr in table_rows:
td = tr.find_all('td')
row = [str(tr.get_text()).strip() for tr in td]
l.append(row)
df = pd.DataFrame(l, columns=columns_names)
The Dataframe comes out as a repetition of the first page only, rather than a copy of all the data in the table.
I agree with #mxbi.
Try it:
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = 'https://scrapethissite.com/pages/forms/?page_num={}'
l = []
for n in range(1, 25):
scrape_url = base_url.format(n)
res = requests.get(scrape_url)
soup = BeautifulSoup(res.text, 'lxml')
table = soup.select('table.table')[0]
columns = table.find('tr').find_all('th')
columns_names = [str(c.get_text()).strip() for c in columns]
table_rows = table.find_all('tr', class_='team')
for tr in table_rows:
td = tr.find_all('td')
row = [str(tr.get_text()).strip() for tr in td]
l.append(row)
df = pd.DataFrame(l, columns=columns_names)
requests is needed because the server wants an user-agent header and pandas read_html doesn't allow for that. As you still want to use pandas to generate the dataframe you could gain some efficiency through using multiprocessing to handle the requests, and within an user defined function extract the table of interest and pass as string to read_html. You will get a list of dataframes which can be combined with pandas concat.
Note: This can't be run from within Jupyter as will block.
import pandas as pd
from multiprocessing import Pool, cpu_count
import requests
from bs4 import BeautifulSoup as bs
def get_table(url:str)-> pd.DataFrame:
soup = bs(requests.get(url).text, 'lxml')
df = pd.read_html(str(soup.select_one('.table')))[0]
df['page_num'] = url.split("=")[-1]
return df
if __name__ == '__main__':
urls = [f'https://scrapethissite.com/pages/forms/?page_num={i}' for i in range(1, 25)]
with Pool(cpu_count()-1) as p:
results = p.map(get_table, urls)
final = pd.concat(results)
print(final)
# final.to_csv('data.csv', index = False, encoding = 'utf-8-sig')

I want to scrape Data from HTML and convert it into pandas DataDrame finally store it as aCSV file

import sys,csv,os
import pandas as pd
from bs4 import BeautifulSoup
import requests
from lxml import html
#url = r'https://agmarknet.gov.in/SearchCmmMkt.aspx?Tx_Commodity=137&Tx_State=0&Tx_District=0&Tx_Market=0&DateFrom=01-jan-2016&DateTo=19-nov-2019&Fr_Date=01-jan-2016&To_Date=19-nov-2019&Tx_Trend=2&Tx_CommodityHead=Ajwan&Tx_StateHead=--Select--&Tx_DistrictHead=--Select--&Tx_MarketHead=--Select--'
Export_Path = r"E:\Knoema_Work_Dataset"
Res = requests.get(url)
Soup = BeautifulSoup(Res.content,'lxml')
#print(Soup.prettify())
mylists = ['137','281','325','166','86','130']
for mylist in mylists:
url = 'https://agmarknet.gov.in/SearchCmmMkt.aspx?Tx_Commodity='+mylist+'+&Tx_State=0&Tx_District=0&Tx_Market=0&DateFrom=01-jan-2016&DateTo=19-nov-2019&Fr_Date=01-jan-2016&To_Date=19-nov-2019&Tx_Trend=2&Tx_CommodityHead=Ajwan&Tx_StateHead=--Select--&Tx_DistrictHead=--Select--&Tx_MarketHead=--Select--'+ mylist
soup = BeautifulSoup(Res.content,'lxml')
table = soup.find('table', {'class':'tableagmark_new'})
DataAll = pd.DataFrame(columns = ['State Name','District Name','Market Name','Variety','Group','Arrivals (Tonnes)','Min Price (Rs./Quintal)','Max Price (Rs./Quintal)','Modal Price (Rs./Quintal)','Reported Date'],dtype = object,index=range(0,1000))
row_marker = 0
for row in table.find_all('tr'):
column_marker = 0
columns = row.findAll('td')
for column in columns:
DataAll.iat[row_marker,column_marker] = column.get_text()
column_marker += 1
DataAll
Export_Path_F = os.path.join(Export_Path, 'aggr.csv')
DataAll.to_csv(Export_Path_F, encoding='utf-8-sig', index=False)
I am getting only the last row in a table in the dataframe 'DataAll'
i need full table to be plotted on the Dataframe
I made iterations to scrape data from multiple table to a single dataframe
please help me so that i can get all the contents in dataframe
Url = https://agmarknet.gov.in/SearchCmmMkt.aspx?Tx_Commodity=137&Tx_State=0&Tx_District=0&Tx_Market=0&DateFrom=01-jan-2016&DateTo=19-nov-2019&Fr_Date=01-jan-2016&To_Date=19-nov-2019&Tx_Trend=2&Tx_CommodityHead=Ajwan&Tx_StateHead=--Select--&Tx_DistrictHead=--Select--&Tx_MarketHead=--Select--

Web-Scraping Python, Indexing Issue for DataFrame

I'm working on a web-scraper for Spotify Charts to extract the top 200 daily songs each day. I have done everything to extract the data I'm interested in including rank, artist, track title, and stream numbers. What I'm stuck on is putting everything into a DataFrame to export as a CSV to excel. Right now when I print my DataFrame, it is treating each cycle as 1 row with 4 columns as opposed to 200 rows with 4 columns.
I'm not sure what the issue is as I've tried just about everything and looked into it as much as I could. I know something is wrong with the indexing because each "what should be a row" has the same first "0" index, when they should go sequential to 199. Also, the column names for my DataFrame keep repeating after each "what should be a row", so I know there is definitely an issue there.
import requests
from bs4 import BeautifulSoup
from datetime import date, timedelta
from time import time
from time import sleep
from random import randint
import pandas as pd
import numpy as np
base_url = 'https://spotifycharts.com/regional/global/daily/'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, 'html.parser')
chart = soup.find('table', {'class': 'chart-table'})
tbody = chart.find('tbody')
for tr in tbody.find_all('tr'):
rank_text = []
rank_text_elem = tr.find('td', {'class': 'chart-table-
position'})
for item in rank_text_elem:
rank_text = []
rank_text.append(item)
artist_text = []
artist_text_elem = tr.find('td', {'class': 'chart-table-
track'}).find_all('span')
for item in artist_text_elem:
artist_text = []
artist_text.append(item.text.replace('by ','').strip())
title_text = []
title_text_elem = tr.find('td', {'class': 'chart-table-
track'}).find_all('strong')
for item in title_text_elem:
title_text = []
title_text.append(item.text)
streams_text = []
streams_text_elem = tr.find('td', {'class': 'chart-table-streams'})
for item in streams_text_elem:
streams_text = []
streams_text.append(item)
# creating dataframe to store 4 variables
list_of_data = list(zip(rank_text, artist_text, title_text,
streams_text))
df = pd.DataFrame(list_of_data, columns =
['Rank','Artist','Title','Streams'])
print(df)
Basically, I'm trying to create a dataframe to hold 4 variables in each row for 200 rows for each date of spotify global charts. Please ignore some of the modules and libraries I've included at the top, they are used for iterating through each page of the historical data based on dynamic urls which I have already figured out. Any help is greatly appreciated! Thank you!
Before for loop I create list all_rows.
Inside for loop I add list with single row of data to all_rows.
After for loop I use all_rows to create DataFrame
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = 'https://spotifycharts.com/regional/global/daily/'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, 'html.parser')
chart = soup.find('table', {'class': 'chart-table'})
tbody = chart.find('tbody')
all_rows = []
for tr in tbody.find_all('tr'):
rank_text = tr.find('td', {'class': 'chart-table-position'}).text
artist_text = tr.find('td', {'class': 'chart-table-track'}).find('span').text
artist_text = artist_text.replace('by ','').strip()
title_text = tr.find('td', {'class': 'chart-table-track'}).find('strong').text
streams_text = tr.find('td', {'class': 'chart-table-streams'}).text
all_rows.append( [rank_text, artist_text, title_text, streams_text] )
# after `for` loop
df = pd.DataFrame(all_rows, columns=['Rank','Artist','Title','Streams'])
print(df.head())
You could use pandas and requests
import pandas as pd
import requests
headers = {'User-Agent': 'Mozilla/5.0'}
url ='https://spotifycharts.com/regional/global/daily/'
r = requests.get(url, headers = headers).content
table = pd.read_html(r)[0] #transfer html to pandas
table.dropna(axis = 1, how = 'all', inplace = True) #drop nan column
table[['Title','Artist']] = table['Unnamed: 3'].str.split(' by ',expand=True) #split title artist strings into two columns
del table['Unnamed: 3'] #remove combined column
table = table[['Track', 'Artist','Title', 'Unnamed: 4']] #re-order cols
table.columns= ['Rank', 'Artist','Title', 'Streams'] #rename cols
print(table)

Extract both text in urls from table in page - beautiful soup

I am trying to extract both text and urls in a table from a website but I only seem to be able to get the text. I am guessing this has something to do with the
text.strip in my code but I am not how I can clean up the html tags without removing the url links in there. Here's what I've put together so far:
import requests
from bs4 import BeautifulSoup
start_number = 0
max_number = 5
urls=[]
for number in range(start_number, max_number + start_number):
url = 'http://www.ispo-org.or.id/index.php?option=com_content&view=article&id=79:pengumumanpublik&catid=10&Itemid=233&showall=&limitstart=' + str(number)+ '&lang=en'
urls.append(url)
data = []
for url in urls:
r = requests.get(url)
soup = BeautifulSoup(r.content,"html.parser")
table = soup.find("table")
table_body = table.find('tbody')
rows = table_body.find_all('tr')
for row in rows:
cols = row.find_all('td')
cols = [ele.text.strip() for ele in cols]
data.append([ele for ele in cols if ele]) # Get rid of empty values
Simply extract the href from the <a> element. For the purpose of the answer, I simplified the code not to worry about subsequent pages.
from collections import namedtuple
import requests
from bs4 import BeautifulSoup
url = 'http://www.ispo-org.or.id/index.php?option=com_content&view=article&id=79:pengumumanpublik&catid=10&Itemid=233&showall=&limitstart=0&lang=en'
data = []
Record = namedtuple('Record', 'id company agency date pdf_link')
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
rows = soup.select('table > tbody > tr')
for row in rows[1:]: # omit header row
cols = row.find_all('td')
fields = [td.text.strip() for td in cols if td.text.strip()]
if fields: # if the row is not empty
pdf_link = row.find('a')['href']
record = Record(*fields, pdf_link)
data.append(record)
>>> data[0].pdf_link
'images/notifikasi/619.%20Pengumuman%20Publik%20PT%20IGP.compressed.pdf'

Categories