I am trying to pass each value of a column as a URL parameter in a loop, write the results to a JSON file, and later convert that file to Excel. I need to do this without creating any function.
import pandas as pd
import requests
import json
from pandas.io.json import json_normalize
# Read the spreadsheet, forcing the 'name' column to be parsed as strings.
df = pd.read_excel('C:/Users/one.xlsx',converters={'name':str})
# Keep only the 'name' column.
df = df['name']
df.head()
# Underlying values of the 'name' column; iterated by the loop below.
dd=df.values
# Credentials handed to requests as a (user, password) auth tuple.
user=b"user"
passwd=b'pass'
auth_values = (user, passwd)
# NOTE(review): indentation was lost in this listing; the statements that
# follow are presumably the loop body -- confirm against the original script.
for i in dd:
ur='http://xyz.co&name='
# NOTE(review): this appends str(dd) (the whole array), not str(i) (the
# current element), so the loop variable never reaches the URL.
url =ur + str(dd)
response = (requests.get(url, auth=auth_values).text)
response_json = json.loads(response)
# NOTE(review): json.dumps receives `response` (the raw text), so the parsed
# value assigned on the previous line is overwritten without being used.
response_json=json.dumps(response,ensure_ascii=True)
# Mode 'w' truncates the file every time it is opened.
writeFile =open('C:/Users/file1.json', 'w')
writeFile.write(response_json)
writeFile.close()
print(url)
You can try this:
import pandas as pd
import requests
import json
from pandas.io.json import json_normalize
# Fetch one JSON response per value of the 'name' column and store all of
# them in a single, valid JSON file (an array with one entry per request).
df = pd.read_excel('C:/Users/one.xlsx', converters={'name': str})
df = df['name']
dd = df.values

user = b"user"
passwd = b'pass'
auth_values = (user, passwd)

# The URL prefix does not change between iterations, so build it once.
ur = 'http://xyz.co&name='

# Parsed payloads, in the same order as the names in the spreadsheet.
results = []
for i in dd:
    url = ur + str(i)
    response = requests.get(url, auth=auth_values).text
    # Keep the *parsed* value. Dumping the raw text instead would store each
    # payload as one escaped JSON string rather than as structured data.
    results.append(json.loads(response))
    print(url)

# Write once, after the loop: back-to-back json.dumps() outputs in one file
# do not form a valid JSON document, whereas a single array does.
with open('C:/Users/file1.json', 'w') as writeFile:
    json.dump(results, writeFile, ensure_ascii=True)
For export to excel:
# Fetch one JSON response per value of the 'name' column, turn each response
# into a DataFrame, and export the combined rows to a single Excel workbook.
from io import StringIO

df = pd.read_excel('C:/Users/one.xlsx', converters={'name': str})
df = df['name']
dd = df.values

user = b"user"
passwd = b'pass'
auth_values = (user, passwd)

# The URL prefix does not change between iterations, so build it once.
ur = 'http://xyz.co&name='

# One DataFrame per response. They are combined after the loop because
# DataFrame.append returned a new frame (its result was being discarded here)
# and the method no longer exists in pandas 2.x.
frames = []
for i in dd:
    url = ur + str(i)
    response = requests.get(url, auth=auth_values).text
    # Wrap the text in a file-like object; passing a literal JSON string to
    # read_json is deprecated in recent pandas versions.
    frames.append(pd.read_json(StringIO(response)))
    print(url)

# pd.concat raises on an empty list, so only build and export when at least
# one response was fetched; df_base stays None otherwise.
df_base = None
if frames:
    df_base = pd.concat(frames, ignore_index=True)
    df_base.to_excel("C:/Users/output.xlsx")
Related
I am writing a script to download images.
I'm reading an Excel file as a pandas DataFrame:
Column A -url links
Column B - Name
downloaded images will have this name, example "A.jpeg"
There will be duplicates in Column B [Name]; in that case I would like to add a suffix to the image name,
so the output will be
A.jpeg
A-1.Jpeg
..
import requests
import pandas as pd
# Load the spreadsheet; the loop below reads its 'url' and 'name' columns.
df = pd.read_excel(r'C:\Users\exdata1.xlsx')
# NOTE(review): indentation was lost in this listing; the statements that
# follow are presumably the loop body -- confirm against the original script.
for index, row in df.iterrows():
url = row['url']
# NOTE(review): this split result is overwritten two lines below before it
# is ever read.
file_name = url.split('/')
r = requests.get(url)
# The output name comes from the 'name' column alone, so rows that share a
# name map to the same file_name.
file_name=(row['name']+".jpeg")
# Only write the response body to disk when the request returned HTTP 200.
if r.status_code == 200:
with open(file_name, "wb") as f:
f.write(r.content)
print (file_name)
I have been trying cumcount but can't seem to get it to work.
I appreciate all the help I can get.
You can try:
import requests
import pandas as pd
# Download every image listed in the spreadsheet. The first file for a given
# name is saved as "<name>.jpeg"; later duplicates get "-1", "-2", ... suffixes.
df = pd.read_excel(r"C:\Users\exdata1.xlsx")

# Maps each base name to the number of duplicates seen so far (0 = seen once).
cnt = {}

for _, row in df.iterrows():
    base = row["name"]
    seen = cnt.get(base)
    if seen is None:
        # First occurrence: no suffix.
        cnt[base] = 0
        name = f"{base}.jpeg"
    else:
        # Repeat occurrence: bump the counter and use it as the suffix.
        cnt[base] = seen + 1
        name = f"{base}-{cnt[base]}.jpeg"

    r = requests.get(row["url"])
    if r.status_code != 200:
        # Nothing is written for unsuccessful responses.
        continue

    with open(name, "wb") as f:
        f.write(r.content)
    print(name)
This will download the files as A.jpeg, A-1.jpeg, A-2.jpeg, ...
I am trying to save the download results to a DataFrame. The download portion works, but at the end the DataFrame is blank.
# Start from a DataFrame created with no data.
df = pd.DataFrame()
url = 'https://www.cms.gov/files/zip/monthly-contract-summary-report-april-2020.zip'
FolderYear = '2020'
FolderName = 'ContractSummary'
FileName = 'monthly-contract-summary.zip'
# NOTE(review): `rootpath` is not defined in this snippet; presumably it is
# set earlier in the script -- confirm.
FileDirectory = rootpath+FolderYear+"/"+FolderName+"/"
FullWritePath = rootpath+FolderYear+"/"+FolderName+"/"+FileName
# Create the target directory on first use.
if not os.path.exists(FileDirectory):
os.makedirs(FileDirectory)
# Download the archive and write the raw bytes to FullWritePath.
r = requests.get(url)
with open(FullWritePath, 'wb') as f:
f.write(r.content)
# Retrieve HTTP meta-data
print(r.status_code)
print(r.headers['content-type'])
print(r.encoding)
# NOTE(review): the frame was created with no data and the values below are
# scalars; the question reports that df is still blank after these lines.
df['Status'] = r.status_code
# NOTE(review): 'headers' is assigned r.status_code, the same value as 'Status'.
df['headers'] = r.status_code
df['FileName'] = FileName
df['FullWritePath'] = FullWritePath
df['ZipFileDowlondLink'] = url
Do this instead of your last few lines:
# Build a one-row DataFrame describing the download in a single step.
columns = ['Status', 'headers', 'FileName', 'FullWritePath', 'ZipFileDowlondLink']
# NOTE(review): the 'headers' slot receives r.status_code, the same value as 'Status'.
record = [r.status_code, r.status_code, FileName, FullWritePath, url]
rows = [record]
df = pd.DataFrame(data=rows, columns=columns)
print(df)
Status headers FileName FullWritePath ZipFileDowlondLink
0 200 200 monthly-contract-summary.zip .2020/ContractSummary/monthly-contract-summary... https://www.cms.gov/files/zip/monthly-contract...
import pandas as pd
import requests as rq
from sqlalchemy import create_engine
# NOTE(review): the URL has '#' between the password and the host; SQLAlchemy
# database URLs normally use '@' there -- possibly a redaction artifact, confirm.
engine = create_engine('postgresql+psycopg2://postgres:3434#127.0.0.1/postgres')
temp = pd.DataFrame()
# Accumulator for the per-vehicle frames concatenated in the loop below.
df = pd.DataFrame()
# A set literal, so the iteration order over the vehicle ids is not fixed.
vehicleList = {"LX59ANR", "SN63NBK", "YY64GRU"}
# NOTE(review): indentation was lost in this listing; it is unclear from the
# text alone which of the following statements belong to the loop body.
for ids in vehicleList:
r = rq.get('https://api.tfl.gov.uk/Vehicle/' + ids + '/Arrivals')
r = r.text
temp = pd.read_json(r)
# Tag every row with the vehicle id it was fetched for.
temp['Type'] = ids
df = pd.concat([df, temp], sort=False).reset_index(drop=True)
# NOTE(review): head(0) selects zero rows, so only the column layout is
# passed to to_sql here.
df.head(0).to_sql('tfl_bus', engine, if_exists='replace', index=False) # truncates the table
Hello. I cannot save data from a pandas DataFrame to PostgreSQL; only the column names are written.
I removed head(0) result like this
This works. I added this line: df['timing'] = list(map(lambda x: json.dumps(x), df['timing']))
import sqlalchemy as sa
import psycopg2
import requests as rq
import pandas as pd
import json
# Fetch the arrivals feed for one vehicle and store it in PostgreSQL.
r = rq.get('https://api.tfl.gov.uk/Vehicle/SN63NBK/Arrivals').text

# Parse the payload and tag every row with a constant 'Type' value.
temp = pd.read_json(r)
temp['Type'] = '1'

# Same shape as the multi-vehicle version: concatenate onto an empty frame,
# then renumber the index from zero.
combined = pd.concat([pd.DataFrame(), temp], sort=False)
df = combined.reset_index(drop=True)

engine = sa.create_engine('postgresql+psycopg2://postgres:3434#127.0.0.1/postgres')

# Serialise each 'timing' value to a JSON string before writing the table.
df['timing'] = [json.dumps(entry) for entry in df['timing']]
df.to_sql('tfl_bus2', engine, if_exists='replace', index=False)
df.head(0) needs to be replaced with just df.
The head(0) strips away the actual data leaving the columns...
I am scraping multiple tables from the web that are exactly like this one (the big batting gamelogs table) and I need the dataframe to ignore the inner header rows that start with the month of the season.
Here is my script so far:
from bs4 import BeautifulSoup
import pandas as pd
import csv
import urllib2
# Download each player's game-log table and save it as "<player id>.csv".
# NOTE(review): written for Python 2 (urllib2). Indentation was lost in this
# listing; presumably everything down to `idx += 1` sits inside the function's
# for loop -- confirm against the original script.
def stir_the_soup():
# NOTE(review): neither file handle is closed in the visible code.
player_links = open('player_links.txt', 'r')
player_ID_nums = open('player_ID_nums.txt', 'r')
# Strip the trailing newline from each ID; the IDs become the CSV file names.
id_nums = [x.rstrip('\n') for x in player_ID_nums]
# Index into id_nums for the URL currently being processed.
idx = 0
# NOTE(review): `url` is not stripped, so a trailing newline (if present)
# counts toward the url[-12] position tested below.
for url in player_links:
#open the url and create bs object
player_link = urllib2.urlopen(url)
bs = BeautifulSoup(player_link, 'html5lib')
#identify which table is needed
# 'b' at position -12 selects the batting table, 'p' the pitching table;
# any other character leaves table_id empty.
table_id = ""
if url[-12] == 'b':
table_id = "batting"
elif url[-12] == 'p':
table_id = "pitching"
#find the table and create dataframe
table = str(bs.find('table', {'id' : (table_id + '_gamelogs')}))
# read_html returns a list of frames; the first one is used.
df = pd.read_html(table, header=0)
df2 = df[0]
# Keep only rows whose PA cell is not the literal header text 'PA'.
df2 = df2[df2.PA != 'PA']
#for the name of the file and file path
file_path = '/Users/kramerbaseball/Desktop/MLB_Web_Scraping_Program/game_logs_non_concussed/'
name_of_file = str(id_nums[idx])
df2.to_csv(path_or_buf=(file_path + name_of_file + '.csv'), sep=',', encoding='utf-8')
# Advance to the ID that pairs with the next URL.
idx += 1
# Run the scraper only when the file is executed as a script.
if __name__ == "__main__":
stir_the_soup()
I tried filtering the dataframe to ignore the rows where PA == PA or HR == HR, but the rows are not deleted. Any help is appreciated.
Notice that in the inner header rows some column values are constant. This will drop the intermediate headers from your df:
# Keep only the rows whose 'Gtm' cell differs from the repeated header text 'Date'.
is_data_row = df2['Gtm'].ne('Date')
df3 = df2[is_data_row]
I'm trying to find out if there is an easier way to append a Date column and an additional info column to my existing csv file. I'm adding these columns because this information is not in the JSON string from the REST API call.
import requests
import json
import http.client
import datetime
import pandas as pd
from pandas.io.json import json_normalize
# NOTE(review): `api` and `headers` are not defined in this snippet;
# presumably they are set up elsewhere -- confirm.
url = api.getinfo()
# verify=False turns off TLS certificate verification for this request.
r = requests.get(url, headers=headers, verify=False)
# Treat any status other than HTTP 200 as an error.
if r.status_code != http.client.OK:
raise requests.HTTPError(r)
# Serialise the "data" payload to a string and parse it straight back.
jsonstring = json.dumps(r.json()["data"])
load = json.loads(jsonstring)
# Flatten the records into a DataFrame.
df = json_normalize(load)
# Columns written to the CSV, in this order.
col = ["poolId", "totalPoolCapacity", "totalLocatedCapacity",
"availableVolumeCapacity", "usedCapacityRate"]
# Append the new rows to the file without a header line.
with open('hss.csv', 'a') as f:
df.to_csv(f, header=False, columns=col)
# Re-read the whole file and set 'date' on every row, including rows written
# by earlier runs.
a = pd.read_csv('hss.csv')
a['date'] = [datetime.date.today()] * len(a)
# NOTE(review): to_csv is called without index=False here and below, so the
# index is written as an extra column on each rewrite -- consistent with the
# "Unnamed: 0" columns the question reports.
a.to_csv('hss.csv')
b = pd.read_csv('hss.csv')
b['storage system'] = "ssystem22"
b.to_csv('hss.csv')
I end up getting extra columns (Unnamed: 0, Unnamed: 0.1) in my CSV file each time the script is run. Also, each time I append, the old dates are overwritten.
,Unnamed: 0,Unnamed: 0.1,poolId,totalPoolCapacity, totalLocatedCapacity,availableVolumeCapacity,usedCapacityRate,date,storage system
0,155472,223618,565064,51,,2017-04-12,ssystem22
1,943174,819098,262042,58,,2017-04-12,ssystem22
0,764600,966017,046668,71,,2017-04-12,ssystem22
1,764600,335680,487650,76,,2017-04-12,ssystem22
2,373700,459800,304446,67,,2017-04-12,ssystem22
I kept researching and found how to fix this. I should have been using the pd.Series function. Below is the corrected code:
import requests
import json
import http.client
import datetime
import pandas as pd
from pandas.io.json import json_normalize
# Fetch the pool statistics from the REST API, add the two columns the API
# does not provide ('storage system' and 'date'), and append the rows to the
# CSV file.
# NOTE(review): `api`, `headers` and `csvfile` are not defined in this
# snippet; presumably they are set up elsewhere -- confirm.
url = api.getinfo()

# NOTE(review): verify=False turns off TLS certificate verification.
r = requests.get(url, headers=headers, verify=False)
if r.status_code != http.client.OK:
    raise requests.HTTPError(r)

# r.json() already returns parsed Python objects, so the payload can be
# normalised directly; dumping it to a string and loading it back is a no-op.
df = json_normalize(r.json()["data"])

# Add the extra columns to the new rows only, before anything is written, so
# rows appended by earlier runs keep their original dates.
df['storage system'] = pd.Series('ssystem22', index=df.index)
df['date'] = pd.Series(datetime.date.today().strftime('%m-%d-%Y'),
                       index=df.index)

col = ["poolId", "totalPoolCapacity", "totalLocatedCapacity",
       "availableVolumeCapacity", "usedCapacityRate", "storage system",
       "date"]

# pandas expects file objects passed to to_csv to be opened with newline='';
# without it, some platforms emit an extra blank line after every row.
with open(csvfile, 'a', newline='') as f:
    df.to_csv(f, header=False, columns=col)