Scrape Historical Bitcoin Data from Coinmarketcap with BeautifulSoup - python

I'm trying to scrape historical Bitcoin data from coinmarketcap.com in order to get close, volume, date, high and low values from the beginning of the year until Sep 30, 2021. I'm new to scraping with Python, and after going through threads and videos for hours I still don't know what my mistake is (or is there something about the website I'm not detecting?). The following is my code:
from bs4 import BeautifulSoup
import requests
import pandas as pd
closeList = []
volumeList = []
dateList = []
highList = []
lowList = []
website = 'https://coinmarketcap.com/currencies/bitcoin/historical-data/'
r = requests.get(website)
soup = BeautifulSoup(r.text, 'lxml')
tr = soup.find_all('tr')
FullData = []
for item in tr:
    closeList.append(item.find_all('td')[4].text)
    volumeList.append(item.find_all('td')[5].text)
    dateList.append(item.find('td', {'style': 'text-align: left;'}).text)
    highList.append(item.find_all('td')[2].text)
    lowList.append(item.find_all('td')[3].text)
    FullData.append([closeList, volumeList, dateList, highList, lowList])
df_columns = ["close", "volume", "date", "high", "low"]
df = pd.DataFrame(FullData, columns = df_columns)
print(df)
As a result I only get:
Empty DataFrame
Columns: [close, volume, date, high, low]
Index: []
The task requires me to scrape with BeautifulSoup and then export to CSV (which at that point is simply df.to_csv). Can somebody help me out? That would be highly appreciated.

Actually, the data is loaded dynamically by JavaScript from an API call's JSON response, so you can grab the data easily as follows:
Code:
import requests
import json
import pandas as pd
api_url= 'https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1&convertId=2781&timeStart=1632441600&timeEnd=1637712000'
r = requests.get(api_url)
data = []
for item in r.json()['data']['quotes']:
    close = item['quote']['close']
    volume = item['quote']['volume']
    date = item['quote']['timestamp']
    high = item['quote']['high']
    low = item['quote']['low']
    data.append([close, volume, date, high, low])
cols = ["close", "volume","date","high","low"]
df = pd.DataFrame(data, columns= cols)
print(df)
#df.to_csv('info.csv',index = False)
Output:
close volume date high low
0 42839.751696 4.283935e+10 2021-09-24T23:59:59.999Z 45080.491063 40936.557169
1 42716.593147 3.160472e+10 2021-09-25T23:59:59.999Z 42996.259704 41759.920425
2 43208.539105 3.066122e+10 2021-09-26T23:59:59.999Z 43919.300970 40848.461660
3 42235.731847 3.098003e+10 2021-09-27T23:59:59.999Z 44313.245882 42190.632576
4 41034.544665 3.021494e+10 2021-09-28T23:59:59.999Z 42775.146142 40931.662500
.. ... ... ... ... ...
56 58119.576194 3.870241e+10 2021-11-19T23:59:59.999Z 58351.113266 55705.180685
57 59697.197134 3.062426e+10 2021-11-20T23:59:59.999Z 59859.880442 57469.725661
58 58730.476639 2.612345e+10 2021-11-21T23:59:59.999Z 60004.426383 58618.931432
59 56289.287323 3.503612e+10 2021-11-22T23:59:59.999Z 59266.358468 55679.840404
60 57569.074876 3.748580e+10 2021-11-23T23:59:59.999Z 57875.516397 55632.759912
[61 rows x 5 columns]
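If you want the range from the original question (start of 2021 through Sep 30, 2021) rather than the window shown above, you can swap in different timeStart/timeEnd values. A minimal sketch, assuming those parameters accept arbitrary Unix timestamps and writing straight to CSV (the filename is just an example):
import datetime
import requests
import pandas as pd

# Assumption: timeStart/timeEnd are plain Unix timestamps (seconds, UTC) and the
# endpoint accepts any range, not just the one used above.
start = int(datetime.datetime(2021, 1, 1, tzinfo=datetime.timezone.utc).timestamp())
end = int(datetime.datetime(2021, 9, 30, tzinfo=datetime.timezone.utc).timestamp())

api_url = ('https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical'
           f'?id=1&convertId=2781&timeStart={start}&timeEnd={end}')
r = requests.get(api_url)

data = []
for item in r.json()['data']['quotes']:
    q = item['quote']
    data.append([q['close'], q['volume'], q['timestamp'], q['high'], q['low']])

df = pd.DataFrame(data, columns=['close', 'volume', 'date', 'high', 'low'])
df.to_csv('btc_jan_to_sep_2021.csv', index=False)  # example filename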

Related

scraping select all checkbox using python

import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://www.pivottrading.co.in/beta/tools/open-high-low-scanner.php?broker=zerodha"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
table = soup.find('table', {'class' : 'table'})
rows = table.find_all('th')
headers = []
for i in table.find_all('th'):
    title = i.text
    headers.append(title)
df = pd.DataFrame(columns = headers)
for row in table.find_all('tr')[1:]:
    data = row.find_all('td')
    row_data = [td.text.strip() for td in data]
    length = len(df)
    df.loc[length] = row_data
print(df)
I need to scrape a table from a website, but it has a select-all checkbox in each row. What should I do?
Any help will be appreciated, thank you.
(If I understand your question correctly, you want to remove the checkbox column from the output of the table.)
Since the checkboxes are in the first column of the table, you can skip them with index slicing: [1:] means "from index 1 to the end", which drops the zero-based index 0 (the checkbox column).
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = (
    "https://www.pivottrading.co.in/beta/tools/open-high-low-scanner.php?broker=zerodha"
)
page = requests.get(url)
soup = BeautifulSoup(page.text, "lxml")
table = soup.find("table", {"class": "table"})
rows = table.find_all("th")
headers = []
for i in table.find_all("th"):
title = i.text.strip()
headers.append(title)
rows = []
for row in table.find_all("tr")[1:]:
data = row.find_all("td")
rows.append(td.text.strip() for td in data[1:])
df = pd.DataFrame(rows[:-1], columns=headers[1:])
print(df)
Output:
Scrip P.Close Open High Low LTP # REAL LTP(NOW) Result
0 BRITANNIA 3379.10 3385.00 3447.00 3385.00 3439.50 3439.50 0
1 EICHERMOT 2551.20 2565.00 2634.00 2565.00 2625.05 2625.05 0
You don't need to check those boxes in order to return all rows.
You can grab the table with pandas and drop the first column by name (if desired).
You can also do some tidying to match the web page.
import pandas as pd
df = pd.read_html('https://www.pivottrading.co.in/beta/tools/open-high-low-scanner.php?broker=zerodha')[0]
df.drop(columns={'Sr.No.'}, inplace=True)
df.iloc[-1, 0:4] = ''
df.fillna(0, inplace=True)
df

Always making a new DataFrame

I followed a tutorial but I didn't like the result, so I am trying to optimise it, but I can't seem to find a way around always making a new DataFrame; I know it is a result of the while loop.
What I want is for the price to be appended to the DataFrame I made.
Thanks in advance!
import pandas as pd
import bs4
import requests
from bs4 import BeautifulSoup
import datetime
#getting actual price
def Real_time_Price(stock):
    url = ('https://finance.yahoo.com/quote/' + stock + '?p=' + stock)
    r = requests.get(url)
    web_content = BeautifulSoup(r.text, 'lxml')
    web_content = web_content.find('div', {'class': "My(6px) Pos(r) smartphone_Mt(6px)"})
    web_content = web_content.find('span').text
    return web_content
and here is where my problem starts
while True:
    price = []
    col = []
    time_stamp = datetime.datetime.now()
    # drop the milliseconds
    time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S")
    # which stocks to check
    ticker_symbols = ['TSLA', 'AAPL', 'MSFT']
    for stock in ticker_symbols:
        price.append(Real_time_Price(stock))
    # getting it into a pandas dataframe
    # You want [data] for pandas to understand they're rows.
    df = pd.DataFrame(data=[price], index=[time_stamp], columns=ticker_symbols)
    print(df)
Create the DataFrame once and use DataFrame.loc[] to append rows:
# which stocks to check
ticker_symbols = ['TSLA', 'AAPL', 'MSFT']
df = pd.DataFrame(columns=ticker_symbols)
while True:
    price = []
    time_stamp = datetime.datetime.now()
    # drop the milliseconds
    time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S")
    for stock in ticker_symbols:
        price.append(Real_time_Price(stock))
    df.loc[time_stamp] = price
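As a design note, repeatedly growing a DataFrame with .loc can get slow over many iterations. A variation is to collect the rows in a plain dict first and build the frame once at the end; a rough sketch, assuming the Real_time_Price function from the question and a fixed number of polls instead of an endless loop:
import datetime
import pandas as pd

ticker_symbols = ['TSLA', 'AAPL', 'MSFT']
rows = {}

for _ in range(10):  # poll a fixed number of times (example value)
    time_stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # one price per ticker for this timestamp, using the question's helper function
    rows[time_stamp] = [Real_time_Price(stock) for stock in ticker_symbols]

# build the DataFrame once, with the timestamps as the index
df = pd.DataFrame.from_dict(rows, orient='index', columns=ticker_symbols)
print(df)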

BeautifulSoup: Scraping CSV list of URLs

I have been trying to download data from different URLs and then save it to a CSV file.
The idea is to extract the highlighted data from: https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
So far I have built the following piece of code:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
url_is = 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow'
read_data = ur.urlopen(url_is).read()
soup_is=BeautifulSoup(read_data, 'lxml')
row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
data=[cell.text for cell in row.parent.select('td') if cell.text!='']
df=pd.DataFrame(data)
print(df.T)
I get as an output:
All good so far.
Now my idea is to extract specific classes from multiple URLs, keep the same headers from the website and export it to a .csv.
The tags and classes stay the same.
Sample URLs:
https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow
Code (I wanted to try with 2 columns: 2015 and 2016)
As desired output I would like something like:
I wrote the following code, but it is giving me issues; any help or advice is welcome:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
import numpy as np
import requests
links = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow', 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']
container = pd.DataFrame(columns=['Name', 'Name2'])
pos=0
for l in links:
    read_data = ur.urlopen(l).read()
    soup_is = BeautifulSoup(read_data, 'lxml')
    row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
    results = [cell.text for cell in row.parent.select('td') if cell.text != '']
    records = []
    for result in results:
        records = []
        Name = result.find('span', attrs={'itemprop': '2015'}).text if result.find('span', attrs={'itemprop': '2015'}) is not None else ''
        Name2 = result.find('span', attrs={'itemprop': '2016'}).text if result.find('span', attrs={'itemprop': '2016'}) is not None else ''
        records.append(Name)
        records.append(Name2)
        container.loc[pos] = records
        pos += 1
import requests
import pandas as pd
urls = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow',
        'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']

def main(urls):
    with requests.Session() as req:
        goal = []
        for url in urls:
            r = req.get(url)
            df = pd.read_html(
                r.content, match="Cash Dividends Paid - Total")[0].iloc[[0], 0:3]
            goal.append(df)
        new = pd.concat(goal)
        print(new)

main(urls)
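Since the original goal was a CSV with one row per ticker, a possible follow-up is to tag each frame with the ticker pulled from its URL before concatenating and then write the result out. A sketch under those assumptions (the Ticker column and the output filename are my own additions):
import requests
import pandas as pd

urls = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow',
        'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']

def main(urls):
    with requests.Session() as req:
        goal = []
        for url in urls:
            r = req.get(url)
            df = pd.read_html(r.content, match="Cash Dividends Paid - Total")[0].iloc[[0], 0:3]
            df['Ticker'] = url.split('/')[-3]          # e.g. 'aapl' or 'MMM', taken from the URL
            goal.append(df)
        new = pd.concat(goal, ignore_index=True)
        new.to_csv('cash_dividends.csv', index=False)  # example output filename
        print(new)

main(urls)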

I want to scrape data from HTML and convert it into a pandas DataFrame, and finally store it as a CSV file

import sys,csv,os
import pandas as pd
from bs4 import BeautifulSoup
import requests
from lxml import html
#url = r'https://agmarknet.gov.in/SearchCmmMkt.aspx?Tx_Commodity=137&Tx_State=0&Tx_District=0&Tx_Market=0&DateFrom=01-jan-2016&DateTo=19-nov-2019&Fr_Date=01-jan-2016&To_Date=19-nov-2019&Tx_Trend=2&Tx_CommodityHead=Ajwan&Tx_StateHead=--Select--&Tx_DistrictHead=--Select--&Tx_MarketHead=--Select--'
Export_Path = r"E:\Knoema_Work_Dataset"
Res = requests.get(url)
Soup = BeautifulSoup(Res.content,'lxml')
#print(Soup.prettify())
mylists = ['137','281','325','166','86','130']
for mylist in mylists:
    url = 'https://agmarknet.gov.in/SearchCmmMkt.aspx?Tx_Commodity='+mylist+'+&Tx_State=0&Tx_District=0&Tx_Market=0&DateFrom=01-jan-2016&DateTo=19-nov-2019&Fr_Date=01-jan-2016&To_Date=19-nov-2019&Tx_Trend=2&Tx_CommodityHead=Ajwan&Tx_StateHead=--Select--&Tx_DistrictHead=--Select--&Tx_MarketHead=--Select--'+ mylist
    soup = BeautifulSoup(Res.content,'lxml')
    table = soup.find('table', {'class':'tableagmark_new'})
    DataAll = pd.DataFrame(columns = ['State Name','District Name','Market Name','Variety','Group','Arrivals (Tonnes)','Min Price (Rs./Quintal)','Max Price (Rs./Quintal)','Modal Price (Rs./Quintal)','Reported Date'],dtype = object,index=range(0,1000))
    row_marker = 0
    for row in table.find_all('tr'):
        column_marker = 0
        columns = row.findAll('td')
        for column in columns:
            DataAll.iat[row_marker,column_marker] = column.get_text()
            column_marker += 1
    DataAll
Export_Path_F = os.path.join(Export_Path, 'aggr.csv')
DataAll.to_csv(Export_Path_F, encoding='utf-8-sig', index=False)
I am getting only the last row of the table in the DataFrame 'DataAll'.
I need the full table to end up in the DataFrame.
I iterate in order to scrape data from multiple tables into a single DataFrame.
Please help me so that I can get all the contents into the DataFrame.
Url = https://agmarknet.gov.in/SearchCmmMkt.aspx?Tx_Commodity=137&Tx_State=0&Tx_District=0&Tx_Market=0&DateFrom=01-jan-2016&DateTo=19-nov-2019&Fr_Date=01-jan-2016&To_Date=19-nov-2019&Tx_Trend=2&Tx_CommodityHead=Ajwan&Tx_StateHead=--Select--&Tx_DistrictHead=--Select--&Tx_MarketHead=--Select--
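The reason only the last row survives is that row_marker is never incremented, so every row is written into position 0, and the same Res response is parsed on every pass instead of fetching each commodity's page. A rough sketch with those two issues fixed, collecting rows in a list instead of a preallocated frame (the cleaned-up URL and the length check on each row are my assumptions about the page):
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup

Export_Path = r"E:\Knoema_Work_Dataset"
cols = ['State Name', 'District Name', 'Market Name', 'Variety', 'Group',
        'Arrivals (Tonnes)', 'Min Price (Rs./Quintal)', 'Max Price (Rs./Quintal)',
        'Modal Price (Rs./Quintal)', 'Reported Date']

all_rows = []
mylists = ['137', '281', '325', '166', '86', '130']
for mylist in mylists:
    url = ('https://agmarknet.gov.in/SearchCmmMkt.aspx?Tx_Commodity=' + mylist +
           '&Tx_State=0&Tx_District=0&Tx_Market=0&DateFrom=01-jan-2016&DateTo=19-nov-2019'
           '&Fr_Date=01-jan-2016&To_Date=19-nov-2019&Tx_Trend=2&Tx_CommodityHead=Ajwan'
           '&Tx_StateHead=--Select--&Tx_DistrictHead=--Select--&Tx_MarketHead=--Select--')
    res = requests.get(url)                     # fetch each commodity page, not just the first one
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find('table', {'class': 'tableagmark_new'})
    if table is None:
        continue                                # skip ids whose page has no table
    for row in table.find_all('tr'):
        cells = [td.get_text(strip=True) for td in row.find_all('td')]
        if len(cells) == len(cols):             # keep only complete data rows (assumption)
            all_rows.append(cells)

DataAll = pd.DataFrame(all_rows, columns=cols)
DataAll.to_csv(os.path.join(Export_Path, 'aggr.csv'), encoding='utf-8-sig', index=False)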

Web-Scraping Python, Indexing Issue for DataFrame

I'm working on a web-scraper for Spotify Charts to extract the top 200 daily songs each day. I have done everything to extract the data I'm interested in, including rank, artist, track title, and stream numbers. What I'm stuck on is putting everything into a DataFrame to export as a CSV to Excel. Right now when I print my DataFrame, it treats each cycle as 1 row with 4 columns, as opposed to 200 rows with 4 columns.
I'm not sure what the issue is, as I've tried just about everything and looked into it as much as I could. I know something is wrong with the indexing because each "what should be a row" has the same first "0" index, when they should run sequentially up to 199. Also, the column names for my DataFrame keep repeating after each "what should be a row", so I know there is definitely an issue there.
import requests
from bs4 import BeautifulSoup
from datetime import date, timedelta
from time import time
from time import sleep
from random import randint
import pandas as pd
import numpy as np
base_url = 'https://spotifycharts.com/regional/global/daily/'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, 'html.parser')
chart = soup.find('table', {'class': 'chart-table'})
tbody = chart.find('tbody')
for tr in tbody.find_all('tr'):
    rank_text = []
    rank_text_elem = tr.find('td', {'class': 'chart-table-position'})
    for item in rank_text_elem:
        rank_text = []
        rank_text.append(item)
    artist_text = []
    artist_text_elem = tr.find('td', {'class': 'chart-table-track'}).find_all('span')
    for item in artist_text_elem:
        artist_text = []
        artist_text.append(item.text.replace('by ', '').strip())
    title_text = []
    title_text_elem = tr.find('td', {'class': 'chart-table-track'}).find_all('strong')
    for item in title_text_elem:
        title_text = []
        title_text.append(item.text)
    streams_text = []
    streams_text_elem = tr.find('td', {'class': 'chart-table-streams'})
    for item in streams_text_elem:
        streams_text = []
        streams_text.append(item)
    # creating dataframe to store 4 variables
    list_of_data = list(zip(rank_text, artist_text, title_text, streams_text))
    df = pd.DataFrame(list_of_data, columns=['Rank', 'Artist', 'Title', 'Streams'])
    print(df)
Basically, I'm trying to create a DataFrame that holds 4 variables in each row, for 200 rows, for each date of the Spotify global charts. Please ignore some of the modules and libraries I've imported at the top; they are used for iterating through each page of the historical data based on dynamic URLs, which I have already figured out. Any help is greatly appreciated! Thank you!
Before the for loop I create the list all_rows.
Inside the for loop I append a list with a single row of data to all_rows.
After the for loop I use all_rows to create the DataFrame.
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = 'https://spotifycharts.com/regional/global/daily/'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, 'html.parser')
chart = soup.find('table', {'class': 'chart-table'})
tbody = chart.find('tbody')
all_rows = []
for tr in tbody.find_all('tr'):
    rank_text = tr.find('td', {'class': 'chart-table-position'}).text
    artist_text = tr.find('td', {'class': 'chart-table-track'}).find('span').text
    artist_text = artist_text.replace('by ', '').strip()
    title_text = tr.find('td', {'class': 'chart-table-track'}).find('strong').text
    streams_text = tr.find('td', {'class': 'chart-table-streams'}).text
    all_rows.append([rank_text, artist_text, title_text, streams_text])
# after `for` loop
df = pd.DataFrame(all_rows, columns=['Rank','Artist','Title','Streams'])
print(df.head())
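Since the end goal was a CSV you can open in Excel, the assembled frame can then be written out directly (the filename here is just an example):
# write the finished chart to disk; Excel can open the CSV directly
df.to_csv('spotify_top200.csv', index=False)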
You could use pandas and requests
import pandas as pd
import requests
headers = {'User-Agent': 'Mozilla/5.0'}
url ='https://spotifycharts.com/regional/global/daily/'
r = requests.get(url, headers = headers).content
table = pd.read_html(r)[0] #transfer html to pandas
table.dropna(axis = 1, how = 'all', inplace = True) #drop nan column
table[['Title','Artist']] = table['Unnamed: 3'].str.split(' by ',expand=True) #split title artist strings into two columns
del table['Unnamed: 3'] #remove combined column
table = table[['Track', 'Artist','Title', 'Unnamed: 4']] #re-order cols
table.columns= ['Rank', 'Artist','Title', 'Streams'] #rename cols
print(table)
