I followed a tutorial but I didn't like the result, so I'm trying to optimise it, but I can't seem to find a way around creating a new dataframe on every iteration, and I know it's caused by the while loop.
What I want is for each price to be appended to the dataframe I made.
Thanks in advance!
import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime

# getting the actual price
def Real_time_Price(stock):
    url = 'https://finance.yahoo.com/quote/' + stock + '?p=' + stock
    r = requests.get(url)
    web_content = BeautifulSoup(r.text, 'lxml')
    web_content = web_content.find('div', {'class': "My(6px) Pos(r) smartphone_Mt(6px)"})
    web_content = web_content.find('span').text
    return web_content
And here is where my problem starts:
while True:
    price = []
    time_stamp = datetime.datetime.now()
    # strip the milliseconds
    time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S")
    # which stocks to check
    ticker_symbols = ['TSLA', 'AAPL', 'MSFT']
    for stock in ticker_symbols:
        price.append(Real_time_Price(stock))
    # getting it into a pandas dataframe
    # You want [price] so pandas understands it's a row.
    df = pd.DataFrame(data=[price], index=[time_stamp], columns=ticker_symbols)
    print(df)
Create the dataframe once, before the loop, and use DataFrame.loc to append rows:
ticker_symbols = ['TSLA', 'AAPL', 'MSFT']
df = pd.DataFrame(columns=ticker_symbols)

while True:
    price = []
    time_stamp = datetime.datetime.now()
    # strip the milliseconds
    time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S")
    for stock in ticker_symbols:
        price.append(Real_time_Price(stock))
    df.loc[time_stamp] = price
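If the loop runs for a long time, growing a DataFrame row by row with .loc gets slow, because pandas copies data as the frame grows. A minimal alternative sketch (the fixed 10-iteration cap stands in for the original infinite loop, purely for illustration): collect the rows in plain lists and build the DataFrame once at the end.

ticker_symbols = ['TSLA', 'AAPL', 'MSFT']
rows = []
index = []
for _ in range(10):  # illustrative bound; the original loops forever
    time_stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Real_time_Price is the scraping function defined above
    rows.append([Real_time_Price(stock) for stock in ticker_symbols])
    index.append(time_stamp)
# one DataFrame construction instead of one per iteration
df = pd.DataFrame(rows, index=index, columns=ticker_symbols)
print(df)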
I'm trying to scrape historical Bitcoin data from coinmarketcap.com in order to get close, volume, date, high and low values from the beginning of the year until Sep 30, 2021. I've gone through threads and videos for hours, and since I'm new to scraping with Python I can't spot my mistake (or is there something about the website I'm not detecting?). The following is my code:
from bs4 import BeautifulSoup
import requests
import pandas as pd

closeList = []
volumeList = []
dateList = []
highList = []
lowList = []

website = 'https://coinmarketcap.com/currencies/bitcoin/historical-data/'
r = requests.get(website)
soup = BeautifulSoup(r.text, 'lxml')
tr = soup.find_all('tr')

FullData = []
for item in tr:
    closeList.append(item.find_all('td')[4].text)
    volumeList.append(item.find_all('td')[5].text)
    dateList.append(item.find('td', {'style': 'text-align: left;'}).text)
    highList.append(item.find_all('td')[2].text)
    lowList.append(item.find_all('td')[3].text)
    FullData.append([closeList, volumeList, dateList, highList, lowList])

df_columns = ["close", "volume", "date", "high", "low"]
df = pd.DataFrame(FullData, columns=df_columns)
print(df)
As a result I only get:
Empty DataFrame
Columns: [close, volume, date, high, low]
Index: []
The task requires me to scrape with BeautifulSoup and then export to csv (which then is simply df.to_csv). Can somebody help me out? That would be highly appreciated.
Actually, the data is loaded dynamically by JavaScript from an API call's JSON response, which is why the static HTML contains no table rows. So you can grab the data easily from the API as follows:
Code:
import requests
import pandas as pd

api_url = 'https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1&convertId=2781&timeStart=1632441600&timeEnd=1637712000'
r = requests.get(api_url)

data = []
for item in r.json()['data']['quotes']:
    close = item['quote']['close']
    volume = item['quote']['volume']
    date = item['quote']['timestamp']
    high = item['quote']['high']
    low = item['quote']['low']
    data.append([close, volume, date, high, low])

cols = ["close", "volume", "date", "high", "low"]
df = pd.DataFrame(data, columns=cols)
print(df)
# df.to_csv('info.csv', index=False)
Output:
close volume date high low
0 42839.751696 4.283935e+10 2021-09-24T23:59:59.999Z 45080.491063 40936.557169
1 42716.593147 3.160472e+10 2021-09-25T23:59:59.999Z 42996.259704 41759.920425
2 43208.539105 3.066122e+10 2021-09-26T23:59:59.999Z 43919.300970 40848.461660
3 42235.731847 3.098003e+10 2021-09-27T23:59:59.999Z 44313.245882 42190.632576
4 41034.544665 3.021494e+10 2021-09-28T23:59:59.999Z 42775.146142 40931.662500
.. ... ... ... ... ...
56 58119.576194 3.870241e+10 2021-11-19T23:59:59.999Z 58351.113266 55705.180685
57 59697.197134 3.062426e+10 2021-11-20T23:59:59.999Z 59859.880442 57469.725661
58 58730.476639 2.612345e+10 2021-11-21T23:59:59.999Z 60004.426383 58618.931432
59 56289.287323 3.503612e+10 2021-11-22T23:59:59.999Z 59266.358468 55679.840404
60 57569.074876 3.748580e+10 2021-11-23T23:59:59.999Z 57875.516397 55632.759912
[61 rows x 5 columns]
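If the date column should hold real datetimes rather than ISO strings before the CSV export, pandas can convert it in place; a one-line follow-up, assuming the DataFrame from the code above:

df['date'] = pd.to_datetime(df['date'])  # parses the ISO 8601 timestamps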
I have a problem which I don't know how to solve (I'm a beginner in coding). This program is supposed to scrape stock price data from Yahoo Finance:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import datetime as dt

def real_time_price(stock_code):
    url = 'https://finance.yahoo.com/quote/' + stock_code + '/'
    r = requests.get(url)
    web_content = BeautifulSoup(r.text, 'lxml')
    web_content = web_content.find('div', {'class': 'My(6px) Pos(r) smartphone_Mt(6px)'})
    web_content = web_content.find('span').text
    if web_content == []:
        web_content = '999999'
    return web_content

LA = ['AAPL', 'FB', 'F', 'AMZN', 'GOOG']
for step in range(1, 101):
    price = []
    col = []  # list that collects the data for the df
    time_stamp = dt.datetime.now()
    time_stamp = time_stamp.strftime('%Y-%m-%d %H:%M:%S')
    for stock_code in LA:
        price.append(real_time_price(stock_code))
    col = [time_stamp]
    col.extend(price)
    df = pd.DataFrame(col)
    df = df.T
    df.to_csv('realtimestockdata.csv', mode='a', header=False)
    print(col)
But it seems that it does not update while it's running. Is there some syntactic error that I missed?
All responses are really appreciated, thank you.
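One hedged observation, not an answer from the original thread: the check web_content == [] can never be true, because .text is a string, and if Yahoo serves different markup, find() returns None and the chained .find('span') raises before the fallback is ever reached. A defensive variant of the function might look like this (keeping the original '999999' sentinel):

def real_time_price(stock_code):
    url = 'https://finance.yahoo.com/quote/' + stock_code + '/'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    div = soup.find('div', {'class': 'My(6px) Pos(r) smartphone_Mt(6px)'})
    span = div.find('span') if div else None
    # fall back to the sentinel when the element is missing entirely
    return span.text if span else '999999'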
I'm just not fully understanding the datetime import yet, as when I parse over the pages to get data I'm not able to get the full table data.
from datetime import datetime, date, timedelta
import requests
import re
from bs4 import BeautifulSoup

base_url = "http://www.harness.org.au/racing/results/?firstDate="
webpage_response = requests.get('http://www.harness.org.au/racing/results/?firstDate=')
soup = BeautifulSoup(webpage_response.content, "html.parser")

format = "%d-%m-%y"
delta = timedelta(days=1)
yesterday = datetime.today() - timedelta(days=1)
yesterday1 = yesterday.strftime(format)
enddate = datetime(2018, 1, 1)
enddate1 = enddate.strftime(format)

while enddate <= yesterday:
    enddate += timedelta(days=1)
    enddate.strftime(format)
    new_url = base_url + str(enddate)
    soup12 = requests.get(new_url)
    soup1 = BeautifulSoup(soup12.content, "html.parser")
    table1 = soup1.find('table', class_='meetingListFull')
    for table2 in table1.find('td'):
        name = table2.find('a')
I want to iterate over all names from the date list to eventually get all hrefs and scrape data from all past results. Below is what I actually want to get from the table1 data, but it was not showing up:
Globe Derby Park
So the purpose is to build the hrefs for the past 2 years, iterate over the tables, and then get the data from each href below.
You can try the following code for your loop:
for tr in table1.find_all('tr'):
    all_cells = tr.find_all('td')
    if all_cells:
        name_cell = all_cells[0]
        try:
            text = name_cell.a.text.strip()
        except AttributeError:
            continue
        else:
            print(text)
find_all returns a list, and since you only look for a name, just use the first cell.
Hope that helps.
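Since the larger goal was collecting the hrefs to follow later, the same loop can also pull the link target out of each name cell. A sketch under that assumption (race_links is an illustrative name, and the hrefs on this site may be relative URLs):

race_links = []
for tr in table1.find_all('tr'):
    cells = tr.find_all('td')
    if not cells or cells[0].a is None:
        continue  # skip header rows and cells without a link
    name = cells[0].a.text.strip()
    href = cells[0].a.get('href')  # may need to be joined with the site root
    race_links.append((name, href))
print(race_links)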
I have been searching for a solution to my problem, but all the answers I find use print() at the end instead of saving the data frames as I would like to.
Below I have an (almost) functioning code that prints 3 separate tables. How do I save these three tables in 3 separate data frames named matches_october, matches_november and matches_december?
The last line in my code is not working as I want it to. I hope it is clear what I would like the code to do (save a data frame at the end of each of the 3 rounds of the loop).
import pandas as pd
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'

for i in valid_pages:
    url = '{}{}{}'.format(base_url, i, end)
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find_all('table')[0]
    df = pd.read_html(str(table))
    print(df)
    matches + valid_pages = df[0]
You can special-case it, but that's not very robust (and it's rather ugly):
if i == 'october':
    matches_october = pd.read_html(str(table))
if i == 'november':
    # so on and so forth
    ...
A more elegant solution is to use a dictionary. Before the loop, declare matches = {}. Then, in each iteration:
matches[i] = pd.read_html(str(table))
Then you can access the October matches DataFrame via matches['october'].
You can't compose variable names using +; try using a dict instead:
import pandas as pd
import requests
from bs4 import BeautifulSoup

matches = {}  # create an empty dict
base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'

for i in valid_pages:
    url = '{}{}{}'.format(base_url, i, end)
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find_all('table')[0]
    df = pd.read_html(str(table))
    print(df)
    matches[i] = df[0]  # store it in the dict
Thanks guys. That worked! :)
import pandas as pd
import requests
from bs4 import BeautifulSoup

matches = {}  # create an empty dict
base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'

for i in valid_pages:
    url = '{}{}{}'.format(base_url, i, end)
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find_all('table')[0]
    df = pd.read_html(str(table))
    matches[i] = df[0]  # store it in the dict

matches_october = matches['october']
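If the three tables also need to be written out to disk, the dict makes that a short loop; a sketch (the matches_<month>.csv filename pattern is an assumption):

for month, frame in matches.items():
    frame.to_csv('matches_{}.csv'.format(month), index=False)  # hypothetical filenames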
I'm working on a web-scraper for Spotify Charts to extract the top 200 daily songs each day. I have done everything to extract the data I'm interested in including rank, artist, track title, and stream numbers. What I'm stuck on is putting everything into a DataFrame to export as a CSV to excel. Right now when I print my DataFrame, it is treating each cycle as 1 row with 4 columns as opposed to 200 rows with 4 columns.
I'm not sure what the issue is as I've tried just about everything and looked into it as much as I could. I know something is wrong with the indexing because each "what should be a row" has the same first "0" index, when they should go sequential to 199. Also, the column names for my DataFrame keep repeating after each "what should be a row", so I know there is definitely an issue there.
import requests
from bs4 import BeautifulSoup
from datetime import date, timedelta
from time import time
from time import sleep
from random import randint
import pandas as pd
import numpy as np

base_url = 'https://spotifycharts.com/regional/global/daily/'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, 'html.parser')
chart = soup.find('table', {'class': 'chart-table'})
tbody = chart.find('tbody')

for tr in tbody.find_all('tr'):
    rank_text = []
    rank_text_elem = tr.find('td', {'class': 'chart-table-position'})
    for item in rank_text_elem:
        rank_text = []
        rank_text.append(item)
    artist_text = []
    artist_text_elem = tr.find('td', {'class': 'chart-table-track'}).find_all('span')
    for item in artist_text_elem:
        artist_text = []
        artist_text.append(item.text.replace('by ','').strip())
    title_text = []
    title_text_elem = tr.find('td', {'class': 'chart-table-track'}).find_all('strong')
    for item in title_text_elem:
        title_text = []
        title_text.append(item.text)
    streams_text = []
    streams_text_elem = tr.find('td', {'class': 'chart-table-streams'})
    for item in streams_text_elem:
        streams_text = []
        streams_text.append(item)
    # creating dataframe to store 4 variables
    list_of_data = list(zip(rank_text, artist_text, title_text, streams_text))
    df = pd.DataFrame(list_of_data, columns=['Rank','Artist','Title','Streams'])
    print(df)
Basically, I'm trying to create a dataframe to hold 4 variables in each row, for 200 rows, for each date of the Spotify global charts. Please ignore some of the modules and libraries I've included at the top; they are used for iterating through each page of the historical data based on dynamic urls, which I have already figured out. Any help is greatly appreciated! Thank you!
Before the for loop I create the list all_rows.
Inside the for loop I append a list with a single row of data to all_rows.
After the for loop I use all_rows to create the DataFrame.
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = 'https://spotifycharts.com/regional/global/daily/'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, 'html.parser')
chart = soup.find('table', {'class': 'chart-table'})
tbody = chart.find('tbody')

all_rows = []
for tr in tbody.find_all('tr'):
    rank_text = tr.find('td', {'class': 'chart-table-position'}).text
    artist_text = tr.find('td', {'class': 'chart-table-track'}).find('span').text
    artist_text = artist_text.replace('by ','').strip()
    title_text = tr.find('td', {'class': 'chart-table-track'}).find('strong').text
    streams_text = tr.find('td', {'class': 'chart-table-streams'}).text
    all_rows.append([rank_text, artist_text, title_text, streams_text])

# after `for` loop
df = pd.DataFrame(all_rows, columns=['Rank','Artist','Title','Streams'])
print(df.head())
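And since the end goal was a CSV export, one more line finishes the job (the filename here is an arbitrary choice):

df.to_csv('spotify_global_daily.csv', index=False)  # hypothetical filename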
You could use pandas and requests:
import pandas as pd
import requests
headers = {'User-Agent': 'Mozilla/5.0'}
url ='https://spotifycharts.com/regional/global/daily/'
r = requests.get(url, headers = headers).content
table = pd.read_html(r)[0] #transfer html to pandas
table.dropna(axis = 1, how = 'all', inplace = True) #drop nan column
table[['Title','Artist']] = table['Unnamed: 3'].str.split(' by ',expand=True) #split title artist strings into two columns
del table['Unnamed: 3'] #remove combined column
table = table[['Track', 'Artist','Title', 'Unnamed: 4']] #re-order cols
table.columns= ['Rank', 'Artist','Title', 'Streams'] #rename cols
print(table)