Adding new column to DataFrame using a list

Adding new column to DataFrame using a list - python

I am scraping the stock prices, and names from Yahoo's finance website. After making a dataframe with three columns "Name", "Code", and "Price" and representing the passed index variable. I want to go to another loop and add a column to the original dataframe with updated prices. But when I add the column it creates NaN values for my original data. What do I need to do to correctly place the indexes and not disturb the original dataframe data?
class Stocks():
def __init__(self):
return self
def Stock_ABV(str):
pattern = re.compile(r'/([A-Z]*-[A-Z]*|[A-Z]*)')
match = pattern.finditer(str)
length = len(str)
for match in match:
name = match.group(0)
return match.group(0)[1:length]
def Yahoo_Finance(index):
url_list = ['https://finance.yahoo.com/quote/GOOG','https://finance.yahoo.com/quote/DOGE-USD',
'https://finance.yahoo.com/quote/AAPL', 'https://finance.yahoo.com/quote/HMC',
'https://finance.yahoo.com/quote/TM', 'https://finance.yahoo.com/quote/DKS',
'https://finance.yahoo.com/quote/SHIB-USD', 'https://finance.yahoo.com/quote/BTC-USD',
'https://finance.yahoo.com/quote/WMT', 'https://finance.yahoo.com/quote/AMZN',
'https://finance.yahoo.com/quote/NKE', 'https://finance.yahoo.com/quote/KO',
'https://finance.yahoo.com/quote/PEP', 'https://finance.yahoo.com/quote/DAL',
'https://finance.yahoo.com/quote/SAVE', 'https://finance.yahoo.com/quote/BLL',
'https://finance.yahoo.com/quote/KMB', 'https://finance.yahoo.com/quote/GIS']
url = ''
i = 0
L1 = []
L2 = []
structure = pd.DataFrame({'Name': [], 'Code': [], 'Price': []})
if index == 1:
while i < len(url_list):
url = url + url_list[i]
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
name = soup.find('h1', {'class': 'D(ib) Fz(18px)'}).text
price = soup.find('fin-streamer', {'class': 'Fw(b) Fz(36px) Mb(-4px) D(ib)'}).text
L1.append([name,Stocks.Stock_ABV(url_list[i]),price])
df = pd.DataFrame(L1, columns = ['Name', 'Code', 'Price'])
i += 1
url = ''
structure = df
structure = structure.set_index(df.index)
else:
while i < len(url_list):
req = requests.get(url_list[i])
soup = BeautifulSoup(req.text, 'html.parser')
price = soup.find('fin-streamer', {'class': 'Fw(b) Fz(36px) Mb(-4px) D(ib)'}).text
L2.append(price)
df2 = pd.DataFrame(L2, columns = [f'Price{index}'])
i += 1
url = ''
structure[f'Price{index}'] = L2
pd.set_option('display.max_rows', None)
return structure
def AFK_Runner():
Stocks.Yahoo_Finance(1)
return Stocks.Yahoo_Finance(2)
Stocks.AFK_Runner()

Do you know the yfinance package?
# pip install yfinance
import yfinance as yf
data = yf.download('GOOG DOGE-USD AAPL HMC')
Output:
Adj Close Close ... Open Volume
AAPL DOGE-USD GOOG HMC AAPL DOGE-USD GOOG HMC ... AAPL DOGE-USD GOOG HMC AAPL DOGE-USD GOOG HMC
Date ...
1980-03-17 NaN NaN NaN 0.718973 NaN NaN NaN 0.893750 ... NaN NaN NaN 0.893750 NaN NaN NaN 26000.0
1980-03-18 NaN NaN NaN 0.731542 NaN NaN NaN 0.909375 ... NaN NaN NaN 0.909375 NaN NaN NaN 2000.0
1980-03-19 NaN NaN NaN 0.724001 NaN NaN NaN 0.900000 ... NaN NaN NaN 0.900000 NaN NaN NaN 2000.0
1980-03-20 NaN NaN NaN 0.724001 NaN NaN NaN 0.900000 ... NaN NaN NaN 0.900000 NaN NaN NaN 0.0
1980-03-21 NaN NaN NaN 0.724001 NaN NaN NaN 0.900000 ... NaN NaN NaN 0.900000 NaN NaN NaN 2000.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2022-02-09 176.279999 0.159123 2829.060059 30.500000 176.279999 0.159123 2829.060059 30.500000 ... 176.050003 0.158357 2816.995117 30.120001 71285000.0 7.786708e+08 1431400.0 1554600.0
2022-02-10 172.119995 0.151889 2772.050049 30.760000 172.119995 0.151889 2772.050049 30.760000 ... 174.139999 0.159145 2790.000000 31.000000 90865900.0 1.053631e+09 1650900.0 1398400.0
2022-02-11 168.639999 0.144847 2682.600098 30.459999 168.639999 0.144847 2682.600098 30.459999 ... 172.330002 0.151895 2775.000000 30.760000 98566000.0 7.767306e+08 1937700.0 1004200.0
2022-02-12 NaN 0.144405 NaN NaN NaN 0.144405 NaN NaN ... NaN 0.144856 NaN NaN NaN 6.026994e+08 NaN NaN
2022-02-13 NaN 0.153793 NaN NaN NaN 0.153793 NaN NaN ... NaN 0.144308 NaN NaN NaN 1.346092e+09 NaN NaN
[11055 rows x 24 columns]

Related

Beautifulsoup: Scrape Table with Key Word Search

I'm trying to scrape tables from multiple websites with key words. I want to scrape values from table which fulfill "Cash and cash equivalent" as row header and "2020" as column header at the same time in order to print to excel file in the future. But I cannot get the code work. Hope you can help me on this! Thank you!!
from bs4 import BeautifulSoup
import requests
import time
from pandas import DataFrame
import pandas as pd
#headers={"Content-Type":"text"}
headers = {'User-Agent': 'registr#jh.edu'}
urls={'https://www.sec.gov/Archives/edgar/data/1127993/0001091818-21-000003.txt',
'https://www.sec.gov/Archives/edgar/data/1058307/0001493152-21-003451.txt'}
Cash=[]
for url in urls:
response = requests.get(url, headers = headers)
response.raise_for_status()
time.sleep(0.1)
soup = BeautifulSoup(response.text,'lxml')
for table in soup.find_all('table'):
for tr in table.find_all('tr'):
row = [td.get_text(strip=True) for td in tr.find_all('td')]
headers = [header.get_text(strip=True).encode("utf-8") for header in tr[0].find_all("th")]
try:
if '2020' in headers[0]:
if row[0] == 'Cash and cash equivalent':
Cash_and_cash_equivalent = f'{url}'+ ' ' + headers+ str(row)
Cash.append(Cash_and_cash_equivalent)
if row[0] == 'Cash':
Cash_ = f'{url}'+ ' ' + headers+ str(row)
Cash.append(Cash_)
except IndexError:
continue
print(Cash)

You could do something along these lines:
import requests
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
headers = {'User-Agent': 'registr#jh.edu'}
r = requests.get('https://www.sec.gov/Archives/edgar/data/1127993/0001091818-21-000003.txt', headers=headers)
dfs = pd.read_html(str(r.text))
for x in range(len(dfs)):
if dfs[x].apply(lambda row: row.astype(str).str.contains('Cash and Cash Equivalents').any(), axis=1).any():
df = dfs[x]
df.dropna(how='all')
new_header = df.iloc[2]
df = df[3:]
df.columns = new_header
display(df) ## or print(df) if you're not in a jupyter notebook
This will return two dataframes, with tables #37 and respectively #71. You may need to improve the table header detection, as only table #71 will come out with proper headers (years).
I tried to look at the second url, however it was hanging for me (huge page).
The printout in terminal will look something like this:
NaN NaN 2020 NaN 2019
3 Cash Flows from Operating Activities NaN NaN NaN NaN
4 Net loss NaN $(13,134,778) NaN $ (2,017,347)
5 Adjustments to reconcile net loss to net cash used in operating activities: NaN NaN NaN NaN
6 Depreciation and amortization NaN 84940 NaN 7832
7 Amortization of convertible debt discounts NaN 74775 NaN 60268
8 Accretion and settlement of financing instruments NaN NaN NaN NaN
9 and change in fair value of derivative liability NaN 1381363 NaN (1,346,797)
10 Stock compensation and stock issued for services NaN 2870472 NaN -
11 Stock issued under Put Purchase Agreement NaN 7865077 NaN -
12 NaN NaN NaN NaN NaN
13 Changes in assets and liabilities: NaN NaN NaN NaN
14 Accounts receivable NaN (696,710) NaN 82359
15 Inventories NaN (78,919) NaN 304970
16 Accounts payable NaN (1,462,072) NaN (22,995)
17 Accrued expenses NaN (158,601) NaN (346,095)
18 Deferred revenue NaN 431147 NaN (91,453)
19 Net cash used in operating activities NaN (2,823,306) NaN (3,369,258)
20 NaN NaN NaN NaN NaN
21 Cash Flows from Investing Activities NaN NaN NaN NaN
22 Acquisition of business, net of cash NaN - NaN 2967918
23 Purchases of property and equipment NaN - NaN (17,636)
24 Net cash provided by investing activities NaN - NaN 2950282
25 NaN NaN NaN NaN NaN
26 Cash Flows from Financing Activities NaN NaN NaN NaN
27 Principal payments on financing lease obligations NaN - NaN (1,649)
28 Principal payments on notes payable NaN (774) NaN -
29 Payments on advances from stockholder, net NaN (33,110) NaN -
30 Proceeds from convertible notes payable NaN 840000 NaN 667000
31 Payments on line of credit, net NaN (300,000) NaN -
32 Proceeds from sale of common stock under Purchase Agreement NaN 2316520 NaN -
33 Net cash provided by financing activities NaN 2822636 NaN 665351
34 NaN NaN NaN NaN NaN
35 Net Increase (Decrease) in Cash and Cash Equivalents NaN (670) NaN 246375
36 NaN NaN NaN NaN NaN
37 Cash, Beginning of Period NaN 412391 NaN 169430
38 NaN NaN NaN NaN NaN
39 Cash, End of Period NaN $ 411,721 NaN $ 415,805

pandas.read_html tables not found

I'm trying to get a list of the major world indices in Yahoo Finance at this URL: https://finance.yahoo.com/world-indices.
I tried first to get the indices in a table by just running
major_indices=pd.read_html("https://finance.yahoo.com/world-indices")[0]
In this case the error was:
ValueError: No tables found
So I read a solution using selenium at pandas read_html - no tables found
the solution they came up with is (with some adjustment):
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.keys import Keys
from webdrivermanager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().download_and_install())
driver.get("https://finance.yahoo.com/world-indices")
html = driver.page_source
tables = pd.read_html(html)
data = tables[1]
Again this code gave me another error:
ValueError: No tables found
I don't know whether to keep using selenium or the pd.read_html is just fine. Either way I'm trying to get this data and don't know how to procede. Can anyone help me?

You don't need Selenium here, you just have to set the euConsentId cookie:
import pandas as pd
import requests
import uuid
url = 'https://finance.yahoo.com/world-indices'
cookies = {'euConsentId': str(uuid.uuid4())}
html = requests.get(url, cookies=cookies).content
df = pd.read_html(html)[0]
Output:
>>> df
Symbol Name Last Price Change % Change Volume Intraday High/Low 52 Week Range Day Chart
0 ^GSPC S&P 500 4023.89 93.81 +2.39% 2.545B NaN NaN NaN
1 ^DJI Dow 30 32196.66 466.36 +1.47% 388.524M NaN NaN NaN
2 ^IXIC Nasdaq 11805.00 434.04 +3.82% 5.15B NaN NaN NaN
3 ^NYA NYSE COMPOSITE (DJ) 15257.36 326.26 +2.19% 0 NaN NaN NaN
4 ^XAX NYSE AMEX COMPOSITE INDEX 4025.81 122.66 +3.14% 0 NaN NaN NaN
5 ^BUK100P Cboe UK 100 739.68 17.83 +2.47% 0 NaN NaN NaN
6 ^RUT Russell 2000 1792.67 53.28 +3.06% 0 NaN NaN NaN
7 ^VIX CBOE Volatility Index 28.87 -2.90 -9.13% 0 NaN NaN NaN
8 ^FTSE FTSE 100 7418.15 184.81 +2.55% 0 NaN NaN NaN
9 ^GDAXI DAX PERFORMANCE-INDEX 14027.93 288.29 +2.10% 0 NaN NaN NaN
10 ^FCHI CAC 40 6362.68 156.42 +2.52% 0 NaN NaN NaN
11 ^STOXX50E ESTX 50 PR.EUR 3703.42 89.99 +2.49% 0 NaN NaN NaN
12 ^N100 Euronext 100 Index 1211.74 28.89 +2.44% 0 NaN NaN NaN
13 ^BFX BEL 20 3944.56 14.35 +0.37% 0 NaN NaN NaN
14 IMOEX.ME MOEX Russia Index 2307.50 9.61 +0.42% 0 NaN NaN NaN
15 ^N225 Nikkei 225 26427.65 678.93 +2.64% 0 NaN NaN NaN
16 ^HSI HANG SENG INDEX 19898.77 518.43 +2.68% 0 NaN NaN NaN
17 000001.SS SSE Composite Index 3084.28 29.29 +0.96% 3.109B NaN NaN NaN
18 399001.SZ Shenzhen Component 11159.79 64.92 +0.59% 3.16B NaN NaN NaN
19 ^STI STI Index 3191.16 25.98 +0.82% 0 NaN NaN NaN
20 ^AXJO S&P/ASX 200 7075.10 134.10 +1.93% 0 NaN NaN NaN
21 ^AORD ALL ORDINARIES 7307.70 141.10 +1.97% 0 NaN NaN NaN
22 ^BSESN S&P BSE SENSEX 52793.62 -136.69 -0.26% 0 NaN NaN NaN
23 ^JKSE Jakarta Composite Index 6597.99 -1.85 -0.03% 0 NaN NaN NaN
24 ^KLSE FTSE Bursa Malaysia KLCI 1544.41 5.61 +0.36% 0 NaN NaN NaN
25 ^NZ50 S&P/NZX 50 INDEX GROSS 11168.18 -9.18 -0.08% 0 NaN NaN NaN
26 ^KS11 KOSPI Composite Index 2604.24 54.16 +2.12% 788539 NaN NaN NaN
27 ^TWII TSEC weighted index 15832.54 215.86 +1.38% 0 NaN NaN NaN
28 ^GSPTSE S&P/TSX Composite index 20099.81 400.76 +2.03% 294.637M NaN NaN NaN
29 ^BVSP IBOVESPA 106924.18 1236.54 +1.17% 0 NaN NaN NaN
30 ^MXX IPC MEXICO 49579.90 270.58 +0.55% 212.868M NaN NaN NaN
31 ^IPSA S&P/CLX IPSA 5058.88 0.00 0.00% 0 NaN NaN NaN
32 ^MERV MERVAL 38390.84 233.89 +0.61% 0 NaN NaN NaN
33 ^TA125.TA TA-125 1964.95 23.38 +1.20% 0 NaN NaN NaN
34 ^CASE30 EGX 30 Price Return Index 10642.40 -213.50 -1.97% 36.837M NaN NaN NaN
35 ^JN0U.JO Top 40 USD Net TRI Index 4118.19 65.63 +1.62% 0 NaN NaN NaN

AssertionError when use df.loc in python

I created a script to load data, check NA values, and fill all NA values. Here is my code:
import pandas as pd
def filter_df(merged_df, var_list):
ind = merged_df.Name.isin(var_list)
return merged_df[ind]
def pivot_df(df):
return df.pivot(index='Date', columns='Name', values=['Open', 'High', 'Low', 'Close'])
def validation_df(input, summary = False):
df = input.copy()
# na check
missing = df.isna().sum().sort_values(ascending=False)
percent_missing = ((missing / df.isnull().count()) * 100).sort_values(ascending=False)
missing_df = pd.concat([missing, percent_missing], axis=1, keys=['Total', 'Percent'], sort=False)
# fill na
columns = list(missing_df[missing_df['Total'] >= 1].reset_index()['index'])
for col in columns:
null_index = df.index[df[col].isnull() == True].tolist()
null_index.sort()
for ind in null_index:
if ind > 0:
print(df.loc[ind, col])
print(df.loc[ind - 1, col])
df.loc[ind, col] = df.loc[ind - 1, col]
if ind == 0:
df.loc[ind, col] = 0
# outliers check
count = []
for col in df.columns:
count.append(sum(df[col] > df[col].mean() + 2 * df[col].std()) + sum(df[col] < df[col].mean() - 2 * df[col].std()))
outliers_df = pd.DataFrame({'Columns': df.columns, 'Count': count}).sort_values(by = 'Count')
if summary == True:
print('missing value check:/n')
print(missing_df)
print('/n outliers check:/n')
print(outliers_df)
return df
def join_df(price_df, transaction_df, var_list):
price_df = filter_df(price_df, var_list)
price_df = pivot_df(price_df)
joined_df = transaction_df.merge(price_df, how = 'left', on = 'Date')
#joined_df = validation_df(joined_df)
return joined_df
token_path = 'https://raw.githubusercontent.com/Carloszone/Cryptocurrency_Research_project/main/datasets/1_token_df.csv'
transaction_path = 'https://raw.githubusercontent.com/Carloszone/Cryptocurrency_Research_project/main/datasets/transaction_df.csv'
var_list = ['Bitcoin', 'Ethereum', 'Golem', 'Solana']
token_df = pd.read_csv(token_path)
transaction_df = pd.read_csv(transaction_path)
df = join_df(token_df, transaction_df, var_list)
df = validation_df(df)
But it did not work. I checked my code and found this issue came from the loc(). For example:
df = join_df(token_df, transaction_df, var_list)
print(df[df.columns[15]])
print(df.loc[1,df.columns[15]])
what I got is:
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
..
2250 NaN
2251 NaN
2252 NaN
2253 NaN
2254 NaN
Name: (High, Solana), Length: 2255, dtype: float64
AssertionError Traceback (most recent call last)
<ipython-input-19-75f01cc22c9c> in <module>()
2
3 print(df[df.columns[15]])
----> 4 print(df.loc[1,df.columns[15]])
2 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in __getitem__(self, key)
923 with suppress(KeyError, IndexError):
924 return self.obj._get_value(*key, takeable=self._takeable)
--> 925 return self._getitem_tuple(key)
926 else:
927 # we by definition only have the 0th axis
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
1107 return self._multi_take(tup)
1108
-> 1109 return self._getitem_tuple_same_dim(tup)
1110
1111 def _get_label(self, label, axis: int):
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _getitem_tuple_same_dim(self, tup)
807 # We should never have retval.ndim < self.ndim, as that should
808 # be handled by the _getitem_lowerdim call above.
--> 809 assert retval.ndim == self.ndim
810
811 return retval
AssertionError:
I don't know why df[column_name] is available, but df.loc[index,columns_name] is wrong.
You can check my code on Colab: https://colab.research.google.com/drive/1Yg280JRwFayW1tdp4OJqTO5-X3dGsItB?usp=sharing

The issue is that you're merging two DataFrames on a column they don't share in common (because you pivoted price_df, Date column became the index). Also the Date columns don't have a uniform format, so you have to make them the same. Replace your join_df function with the one below and it will work as expected.
I added comments on the lines that had to be added.
def join_df(price_df, transaction_df, var_list):
price_df = filter_df(price_df, var_list)
price_df = pivot_df(price_df)
# After pivot the Date column is the index, and price_df has MultiIndex columns
# since we want to merge it with transaction_df, we need to first flatten the columns
price_df.columns = price_df.columns.map('.'.join)
# and reset_index so that we have the index as the Date column
price_df = price_df.reset_index()
# the Dates are formatted differently across the two DataFrames;
# one has the following format: '2016-01-01' and the other '2016/1/1'
# to have a uniform format, we convert the both Date columns to datetime objects
price_df['Date'] = pd.to_datetime(price_df['Date'])
transaction_df['Date'] = pd.to_datetime(transaction_df['Date'])
joined_df = transaction_df.merge(price_df, how = 'left', on = 'Date')
#joined_df = validation_df(joined_df)
return joined_df
Output:
Date total_transaction_count Volume gas_consumption \
0 2016-01-01 2665 NaN NaN
1 2016-01-02 4217 NaN NaN
2 2016-01-03 4396 NaN NaN
3 2016-01-04 4776 NaN NaN
4 2016-01-05 26649 NaN NaN
... ... ... ... ...
2250 2022-02-28 1980533 1.968686e+06 8.626201e+11
2251 2022-03-01 2013145 2.194055e+06 1.112079e+12
2252 2022-03-02 1987934 2.473327e+06 1.167615e+12
2253 2022-03-03 1973190 3.093248e+06 1.260826e+12
2254 2022-03-04 1861286 4.446204e+06 1.045814e+12
old_ave_gas_fee new_avg_gas_fee new_avg_base_fee \
0 0.000000e+00 0.000000e+00 0.000000e+00
1 0.000000e+00 0.000000e+00 0.000000e+00
2 0.000000e+00 0.000000e+00 0.000000e+00
3 0.000000e+00 0.000000e+00 0.000000e+00
4 0.000000e+00 0.000000e+00 0.000000e+00
... ... ... ...
2250 6.356288e-08 6.356288e-08 5.941877e-08
2251 5.368574e-08 5.368574e-08 4.982823e-08
2252 5.567472e-08 5.567472e-08 4.782055e-08
2253 4.763823e-08 4.763823e-08 4.140883e-08
2254 4.566440e-08 4.566440e-08 3.547666e-08
new_avg_priority_fee Open.Bitcoin Open.Ethereum ... High.Golem \
0 0.000000e+00 430.0 NaN ... NaN
1 0.000000e+00 434.0 NaN ... NaN
2 0.000000e+00 433.7 NaN ... NaN
3 0.000000e+00 430.7 NaN ... NaN
4 0.000000e+00 433.3 NaN ... NaN
... ... ... ... ... ...
2250 4.144109e-09 37707.2 2616.34 ... 0.48904
2251 3.857517e-09 43187.2 2922.44 ... 0.48222
2252 7.854179e-09 44420.3 2975.80 ... 0.47550
2253 6.229401e-09 NaN NaN ... NaN
2254 1.018774e-08 NaN NaN ... NaN
High.Solana Low.Bitcoin Low.Ethereum Low.Golem Low.Solana \
0 NaN 425.9 NaN NaN NaN
1 NaN 430.7 NaN NaN NaN
2 NaN 423.1 NaN NaN NaN
3 NaN 428.6 NaN NaN NaN
4 NaN 428.9 NaN NaN NaN
... ... ... ... ... ...
2250 NaN 37458.9 2574.12 0.41179 NaN
2251 NaN 42876.6 2858.54 0.45093 NaN
2252 NaN 43361.3 2914.70 0.43135 NaN
2253 NaN NaN NaN NaN NaN
2254 NaN NaN NaN NaN NaN
Close.Bitcoin Close.Ethereum Close.Golem Close.Solana
0 434.0 NaN NaN NaN
1 433.7 NaN NaN NaN
2 430.7 NaN NaN NaN
3 433.3 NaN NaN NaN
4 431.2 NaN NaN NaN
... ... ... ... ...
2250 43188.2 2922.50 0.47748 NaN
2251 44420.3 2975.81 0.47447 NaN
2252 43853.2 2952.47 0.43964 NaN
2253 NaN NaN NaN NaN
2254 NaN NaN NaN NaN
[2255 rows x 24 columns]

Further table url link scraping

Using this awesome code I can scrape the majority of the data, however it seems like some of the url's may not scrape. Also I need to be able to identify the data source in the future. Therefore I have two question:
how to program in a check that all links are identified and scraped?
how to insert the url link as a new column so each row is identified?
url = 'https://www.rootsandrain.com/organiser21/uci/events/filters/dh/'
response = requests.get(url)
soup = BeautifulSoup(response.content)
urlList = ['https://www.rootsandrain.com'+row.a['href'] for row in soup.select('#T1 tbody tr')]
data = []
for url in urlList:
try:
data.append(pd.read_html(url)[0])
except:
print(f'No tables found:{url}')
pd.concat(data)
Original post
Scraping URL links in a table

A small adjustment to the try/except block, that writes the url and also the status of scraping:
try:
df = pd.read_html(url)[0]
df['url'] = url
df['scraped'] = True
data.extend(df.to_dict(orient = 'records'))
except:
print(f'No tables found:{url}')
data.append(dict(url=url, scraped=False))
Example
Limited to for url in urlList[5:10] to not iterate all of the links in this example, just kick the [5:10] to get all:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://www.rootsandrain.com/organiser21/uci/events/filters/dh/'
response = requests.get(url)
soup = BeautifulSoup(response.content)
urlList = ['https://www.rootsandrain.com'+row.a['href'] for row in soup.select('#T1 tbody tr')]
data = []
for url in urlList[5:10]:
try:
df = pd.read_html(url)[0]
df['url'] = url
df['scraped'] = True
data.extend(df.to_dict(orient = 'records'))
except:
print(f'No tables found:{url}')
data.append(dict(url=url, scraped=False))
pd.DataFrame(data)
#to get rid of all these NaN columns
#pd.DataFrame(data).dropna(axis=1, how='all')
Output
url
scraped
Pos⇧
Bib
Name
Licence
YoB
Sponsors
km/h
sector1 +
sector2 +
sector3 +
sector4 +
sector5 =
Qualifier
km/h.1
sector1 +.1
sector2 +.1
sector3 +.1
sector4 +.1
sector5 =.1
Run 1
Diff
https://www.rootsandrain.com/event9599/2022-jul-9-mercedes-benz-uci-world-cup-dh-4-lenzerheide/
False
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
https://www.rootsandrain.com/event9598/2022-jun-11-mercedes-benz-uci-world-cup-dh-3-leogang/
False
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
https://www.rootsandrain.com/event9597/2022-may-22-mercedes-benz-uci-world-cup-dh-2-fort-william/
False
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
https://www.rootsandrain.com/event9607/2022-apr-23-dhi-masters-world-championships-22-villa-la-angostura/
False
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
https://www.rootsandrain.com/event9596/2022-mar-27-mercedes-benz-uci-world-cup-dh-1-lourdes/
True
1st
6
Amaury PIERRON
1.00088e+10
1996
COMMENCAL - MUC-OFF BY RIDING
62.211
45.680s5
32.179s3
36.880s2
27.636s5
28.399s8
2:50.7743
60.3411
44.269s1
33.091s23
36.159s1
26.774s2
27.418s1
2:47.7111
-
https://www.rootsandrain.com/event9596/2022-mar-27-mercedes-benz-uci-world-cup-dh-1-lourdes/
True
2nd
14
Finn ILES
1.00909e+10
1999
Specialized Gravity
60.072
44.915s3
32.242s5
37.407s5
27.636s5
28.254s3
2:50.4542
59.0821
44.924s2
31.474s3
37.175s4
26.781s3
28.204s6
2:48.5582
0.847s
https://www.rootsandrain.com/event9596/2022-mar-27-mercedes-benz-uci-world-cup-dh-1-lourdes/
True
3rd
1
Loïc BRUNI
1.00075e+10
1994
Specialized Gravity
59.6027
45.080s4
31.429s1
37.254s4
27.164s2
28.344s6
2:49.2711
58.8225
45.023s4
31.653s6
37.112s3
27.172s5
27.837s3
2:48.7973
1.086s
https://www.rootsandrain.com/event9596/2022-mar-27-mercedes-benz-uci-world-cup-dh-1-lourdes/
True
4th
8
Benoit COULANGES
1.00082e+10
1994
Dorval AM Commencal
60.671
46.785s15
31.773s2
36.688s1
27.496s3
28.348s7
2:51.0905
59.0821
45.302s7
31.591s5
37.745s9
26.711s1
27.828s2
2:49.1774
1.466s
https://www.rootsandrain.com/event9596/2022-mar-27-mercedes-benz-uci-world-cup-dh-1-lourdes/
True
5th
17
Luca SHAW
1.00088e+10
1996
Canyon Collective Factory Team
59.0836
46.267s7
32.952s14
38.067s10
28.504s17
28.453s9
2:54.24310
61.223
45.223s6
31.572s4
37.436s7
27.629s12
28.282s10
2:50.1425
2.431s
https://www.rootsandrain.com/event9596/2022-mar-27-mercedes-benz-uci-world-cup-dh-1-lourdes/
True
6th
9
Danny HART (elt)
1.00055e+10
1991
CUBE FACTORY RACING
59.8722
47.041s22
32.715s8
37.246s3
28.000s11
28.301s4
2:53.3037
60.0714
46.511s21
31.868s7
36.600s2
27.391s10
28.036s4
2:50.4066
2.695s

How can I correctly write a function? (Python)

Here is my definition:
def fill(df_name):
"""
Function to fill rows and dates.
"""
# Fill Down
for row in df_name[0]:
if 'Unnamed' in row:
df_name[0] = df_name[0].replace(row, np.nan)
df_name[0] = df_name[0].ffill(limit=2)
df_name[1] = df_name[1].ffill(limit=2)
# Fill in Dates
for col in df_name.columns:
if col >= 3:
old_dt = datetime(1998, 11, 15)
add_dt = old_dt + relativedelta(months=col - 3)
new_dt = add_dt.strftime('%#m/%d/%Y')
df_name = df_name.rename(columns={col: new_dt})
and then I call:
fill(df_cars)
The first half of the formula works (columns 0 and 1 have filled in correctly). However, as you can see, the columns are labeled 0-288. When I delete this function and simply run the code (changing df_name to df_cars) it runs correctly and the column names are the dates specified in the second half of the function.
What could be causing this to not execute the # Fill in Dates portion when defined in a function? Does it have to do with local variables?
0 1 2 3 4 5 ... 287 288 289 290 291 292
0 France NaN Market 3330 7478 2273 ... NaN NaN NaN NaN NaN NaT
1 France NaN World 362 798 306 ... NaN NaN NaN NaN NaN NaT
2 France NaN % 0.108709 0.106713 0.134624 ... NaN NaN NaN NaN NaN NaT
3 Germany NaN Market 1452 2025 1314 ... NaN NaN NaN NaN NaN NaT
4 Germany NaN World 209 246 182 ... NaN NaN NaN NaN NaN NaT
.. ... ... ... ... ... ... ... ... ... ... ... ... ..
349 Slovakia 0 World 1 1 0 ... NaN NaN NaN NaN NaN NaT
350 Slovakia 0 % 0.5 0.5 0 ... NaN NaN NaN NaN NaN NaT

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Adding new column to DataFrame using a list - python

Related

Beautifulsoup: Scrape Table with Key Word Search

pandas.read_html tables not found

AssertionError when use df.loc in python

Further table url link scraping

How can I correctly write a function? (Python)

Categories

Resources