I am scraping multiple tables from the web that are exactly like this one (the big batting gamelogs table) and I need the dataframe to ignore the inner header rows that start with the month of the season.
Here is my script so far:
from bs4 import BeautifulSoup
import pandas as pd
import csv
import urllib2
def stir_the_soup():
player_links = open('player_links.txt', 'r')
player_ID_nums = open('player_ID_nums.txt', 'r')
id_nums = [x.rstrip('\n') for x in player_ID_nums]
idx = 0
for url in player_links:
#open the url and create bs object
player_link = urllib2.urlopen(url)
bs = BeautifulSoup(player_link, 'html5lib')
#identify which table is needed
table_id = ""
if url[-12] == 'b':
table_id = "batting"
elif url[-12] == 'p':
table_id = "pitching"
#find the table and create dataframe
table = str(bs.find('table', {'id' : (table_id + '_gamelogs')}))
df = pd.read_html(table, header=0)
df2 = df[0]
df2 = df2[df2.PA != 'PA']
#for the name of the file and file path
file_path = '/Users/kramerbaseball/Desktop/MLB_Web_Scraping_Program/game_logs_non_concussed/'
name_of_file = str(id_nums[idx])
df2.to_csv(path_or_buf=(file_path + name_of_file + '.csv'), sep=',', encoding='utf-8')
idx += 1
if __name__ == "__main__":
stir_the_soup()
I tried taking the dataframe and ignoring the rows where PA == PA or HR == HR but it will not delete the rows. Any help is appreciated
Notice that in some inner headers columns values are constant. This will drop intermediate headers from your df:
df3 = df2[df2['Gtm']!='Date']
Related
i am writing a script to download images.
I'm reading a excel file as a pandas dataframe
Column A -url links
Column B - Name
downloaded images will have this name, example "A.jpeg"
There will be duplicates in Column B[Name] in that case i would like to add a suffix on the image name.
so the output will be
A.jpeg
A-1.Jpeg
..
import requests
import pandas as pd
df = pd.read_excel(r'C:\Users\exdata1.xlsx')
for index, row in df.iterrows():
url = row['url']
file_name = url.split('/')
r = requests.get(url)
file_name=(row['name']+".jpeg")
if r.status_code == 200:
with open(file_name, "wb") as f:
f.write(r.content)
print (file_name)
I have been trying cumcount but can't really seem to get it to work..
Apreciate all the help I can get
You can try:
import requests
import pandas as pd
df = pd.read_excel(r"C:\Users\exdata1.xlsx")
cnt = {}
for index, row in df.iterrows():
name = row["name"]
if name not in cnt:
cnt[name] = 0
name = f"{name}.jpeg"
else:
cnt[name] += 1
name = f"{name}-{cnt[name]}.jpeg"
url = row["url"]
r = requests.get(url)
if r.status_code == 200:
with open(name, "wb") as f:
f.write(r.content)
print(name)
This will download the files as A.jpeg, A-1.jpeg, A-2.jpeg, ...
I'm trying this code on a folder with bunch of 'txt' files
import pandas as pd
import os
for review in os.listdir('ebert_reviews'):
with open(os.path.join('ebert_reviews',review),encoding='utf-8') as file:
list_dir=[]
title = file.readline()[:-1]
url = file.readline()[:-1]
review_text = file.read()
list_dir.append({'title':title,
'url':url,
'review_text':review_text})
#print(list_dir)
df = pd.DataFrame(list_dir)
print(df)
the dataframe only saves one of the rows and it's not even the last one, how can I add every entry to it?!
This should work
import pandas as pd
import os
list_dir=[]
for review in os.listdir('ebert_reviews'):
with open(os.path.join('ebert_reviews',review),encoding='utf-8') as file:
title = file.readline()[:-1]
url = file.readline()[:-1]
review_text = file.read()
list_dir.append({'title':title,
'url':url,
'review_text':review_text})
#print(list_dir)
df = pd.DataFrame(list_dir)
print(df)
i am having issues get a nested for loop to output individual csv files for an API call. The API call is paginated, so we have to query the API multiple times and append the data Also have to loop through for every exchange.
The way the code is now it's only outputting the last page of data for a couple of exchanges and the the following exchanges just have 'name' in the CSV, no other data...
from pycoingecko import CoinGeckoAPI
cg = CoinGeckoAPI()
import pandas as pd
import time
##grab a list of all the exchangeslisted on CG
ex_list = cg.get_exchanges_list()
#normalise the json
df = pd.json_normalize(ex_list)
#output to csv
#df.to_csv('exchange_list.csv', encoding='utf-8', index=False)
#make a list with just one column
id_list = df['id'].to_list()
def read_exchange_tickers():
for x in id_list:
for i in range(1,10):
appended_data = []
data = cg.get_exchanges_tickers_by_id(x, page = str(i))
appended_data.append(data)
#time.sleep(10)
#define path + filename
path = 'ticker_lists/'
filename = path + x + '_' + '.csv'
appended_data = pd.json_normalize(appended_data, record_path=['tickers'], meta=['name'])
appended_data.to_csv(filename, encoding='utf-8', index=False)
time.sleep(10)
read_exchange_tickers()
You should collect all data for each id and then save the data to file.
def read_exchange_tickers():
for x in id_list:
appended_data = []
# collect all the data for current id
for i in range(1,10):
data = cg.get_exchanges_tickers_by_id(x, page = str(i))
appended_data.append(data)
# save the data to csv
path = 'ticker_lists/'
filename = path + x + '_' + '.csv'
appended_data = pd.json_normalize(appended_data, record_path=['tickers'], meta=['name'])
appended_data.to_csv(filename, encoding='utf-8', index=False)
time.sleep(10)
i am looking to loop through the coingecko api for all of the exchanges listed on there, pull the tickers that are listed for each exchange [this is paginated to 100 rows], loop through all of the pages. There is no way of telling how many. Then store all rows out to a CSV.
Here is what I have came up with so far.
from pycoingecko import CoinGeckoAPI
cg = CoinGeckoAPI()
import pandas as pd
#grab a list of all the exchangeslisted on CG
ex_list = cg.get_exchanges_list()
#df_ex_list = pd.read_json(exchanges_list)
df = pd.json_normalize(ex_list)
#output to csv
df.to_csv('exchange_list.csv', encoding='utf-8', index=False)
id_list = df['id'].tolist()
def get_ex_tickers():
for x in id_list:
# get tickers
d = cg.get_exchanges_tickers_by_id(x, page_integer = 2)###the num of page integers is not know
#import into pandas df
df = pd.json_normalize(d, record_path=['tickers'], meta=['name'])
#define path + filename
path = 'ticker_lists/'
filename = path+ x +'_ticker_list' + '.csv'
#output to csv
df.to_csv(filename, encoding='utf-8', index=False)
get_ex_tickers()
import pandas as pd
import requests as rq
from sqlalchemy import create_engine
engine = create_engine('postgresql+psycopg2://postgres:3434#127.0.0.1/postgres')
temp = pd.DataFrame()
df = pd.DataFrame()
vehicleList = {"LX59ANR", "SN63NBK", "YY64GRU"}
for ids in vehicleList:
r = rq.get('https://api.tfl.gov.uk/Vehicle/' + ids + '/Arrivals')
r = r.text
temp = pd.read_json(r)
temp['Type'] = ids
df = pd.concat([df, temp], sort=False).reset_index(drop=True)
df.head(0).to_sql('tfl_bus', engine, if_exists='replace', index=False) # truncates the table
Hello. cannot save data from pandas(dataframe) to postgresql. only column names occurred.
I removed head(0) result like this
This work , I added this line : df['timing'] = list(map(lambda x: json.dumps(x), df['timing']))
import sqlalchemy as sa
import psycopg2
import requests as rq
import pandas as pd
import json
r = rq.get('https://api.tfl.gov.uk/Vehicle/SN63NBK/Arrivals')
temp = pd.DataFrame()
df = pd.DataFrame()
r = r.text
temp = pd.read_json(r)
temp['Type'] = '1'
df = pd.concat([df, temp], sort=False).reset_index(drop=True)
engine=sa.create_engine('postgresql+psycopg2://postgres:3434#127.0.0.1/postgres')
df['timing'] = list(map(lambda x: json.dumps(x), df['timing']))
df.to_sql('tfl_bus2', engine, if_exists='replace', index=False)
df.head(0) needs to be replaced with just df.
The head(0) strips away the actual data leaving the columns...