I am trying to save the download results to a DataFrame. The download portion works, but at the end the DataFrame is blank.
import os
import requests
import pandas as pd

# rootpath is defined earlier in the script
df = pd.DataFrame()
url = 'https://www.cms.gov/files/zip/monthly-contract-summary-report-april-2020.zip'
FolderYear = '2020'
FolderName = 'ContractSummary'
FileName = 'monthly-contract-summary.zip'
FileDirectory = rootpath + FolderYear + "/" + FolderName + "/"
FullWritePath = rootpath + FolderYear + "/" + FolderName + "/" + FileName
if not os.path.exists(FileDirectory):
    os.makedirs(FileDirectory)
r = requests.get(url)
with open(FullWritePath, 'wb') as f:
    f.write(r.content)
# Retrieve HTTP meta-data
print(r.status_code)
print(r.headers['content-type'])
print(r.encoding)
df['Status'] = r.status_code
df['headers'] = r.status_code
df['FileName'] = FileName
df['FullWritePath'] = FullWritePath
df['ZipFileDowlondLink'] = url
Assigning a scalar to a column of an empty DataFrame produces a column with zero rows, which is why your DataFrame stays blank. Do this instead of your last few lines:
rows = []
columns = ['Status', 'headers', 'FileName', 'FullWritePath', 'ZipFileDowlondLink']
rows.append([r.status_code, r.status_code, FileName, FullWritePath, url])
df = pd.DataFrame(rows, columns=columns)
print(df)
Status headers FileName FullWritePath ZipFileDowlondLink
0 200 200 monthly-contract-summary.zip .2020/ContractSummary/monthly-contract-summary... https://www.cms.gov/files/zip/monthly-contract...
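Equivalently, you can build the single row as a dict. A minimal sketch, assuming the 'headers' column was actually meant to hold the response's content type (the original writes the status code into both columns); the names r, url, FileName, and FullWritePath come from the question's script above:

row = {
    'Status': r.status_code,
    'headers': r.headers.get('content-type'),  # assumption: content type rather than a second status code
    'FileName': FileName,
    'FullWritePath': FullWritePath,
    'ZipFileDowlondLink': url,
}
df = pd.DataFrame([row])  # a list with one dict yields a one-row DataFrame
print(df)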
I am writing a script to download images. I'm reading an Excel file as a pandas DataFrame:
Column A - url links
Column B - Name
Downloaded images will take the name from Column B, for example "A.jpeg". There will be duplicates in Column B (Name); in that case I would like to add a suffix to the image name, so the output will be:
A.jpeg
A-1.jpeg
...
import requests
import pandas as pd

df = pd.read_excel(r'C:\Users\exdata1.xlsx')
for index, row in df.iterrows():
    url = row['url']
    file_name = url.split('/')
    r = requests.get(url)
    file_name = (row['name'] + ".jpeg")
    if r.status_code == 200:
        with open(file_name, "wb") as f:
            f.write(r.content)
        print(file_name)
I have been trying cumcount but can't really seem to get it to work.
Appreciate all the help I can get!
You can try:
import requests
import pandas as pd

df = pd.read_excel(r"C:\Users\exdata1.xlsx")
cnt = {}
for index, row in df.iterrows():
    name = row["name"]
    if name not in cnt:
        cnt[name] = 0
        name = f"{name}.jpeg"
    else:
        cnt[name] += 1
        name = f"{name}-{cnt[name]}.jpeg"
    url = row["url"]
    r = requests.get(url)
    if r.status_code == 200:
        with open(name, "wb") as f:
            f.write(r.content)
        print(name)
This will download the files as A.jpeg, A-1.jpeg, A-2.jpeg, ...
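On the cumcount idea: df.groupby("name").cumcount() numbers repeated names 0, 1, 2, ... within each group, so a vectorized sketch (assuming the same 'name' column as above) can precompute every file name before the download loop:

import pandas as pd

df = pd.read_excel(r"C:\Users\exdata1.xlsx")
dup_idx = df.groupby("name").cumcount()                    # 0 for the first occurrence, then 1, 2, ...
suffix = dup_idx.map(lambda n: "" if n == 0 else f"-{n}")  # no suffix the first time around
df["file_name"] = df["name"] + suffix + ".jpeg"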
So here is a sample of my Excel layout.
But after merging, the result has two headers and loses the layout.
Here is my code:
import pandas as pd
import glob

path = r"C:/Users//"
fname = glob.glob(path + "/*.xlsx")
result_DFs1 = pd.DataFrame()
result_DFs2 = pd.DataFrame()
for i in fname:
    try:
        df1 = pd.read_excel(i, sheet_name="Test1")
        result_DFs1 = pd.concat([result_DFs1, df1])
    except:
        pass
for i in fname:
    try:
        df2 = pd.read_excel(i, sheet_name="Test2")
        result_DFs2 = pd.concat([result_DFs2, df2])
    except:
        pass
with pd.ExcelWriter('pandas_to_excel.xlsx') as writer:
    result_DFs1.to_excel(writer, sheet_name='Test1')
    result_DFs2.to_excel(writer, sheet_name='Test2')
Is there a way I can have just one header, without losing the Excel layout?
You can keep track of your files and only include the header for the first one. Something like:
first = True
for i in fname:
    try:
        if first:
            df1 = pd.read_excel(i, sheet_name="Test1", skiprows=0, header=0)
            first = False
        else:
            df1 = pd.read_excel(i, sheet_name="Test1", skiprows=1, header=None)
            df1.columns = result_DFs1.columns  # align the headerless frame with the columns read from the first file
        result_DFs1 = pd.concat([result_DFs1, df1])
    except:
        pass
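Alternatively, if every file carries the same single header row, you can skip the flag entirely: read_excel consumes the header, and pd.concat will not duplicate it. A sketch under that assumption, reusing fname from above:

frames = [pd.read_excel(f, sheet_name="Test1") for f in fname]
result_DFs1 = pd.concat(frames, ignore_index=True)
result_DFs1.to_excel("pandas_to_excel.xlsx", sheet_name="Test1", index=False)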
Hello amazing Pythoners,
I am hoping to get some help with my scenario below.
I have a list of a few centers, based on which I want to extract employee data for each center. Earlier I was using the method below and it was working beautifully.
row[0] in the CSV file had the whole URL, which looked something like this:
https://api.test.com/v1/centers/96d901bd-2fcc-4f59-91d7-de18f0b0aa90/employees?page=1&size=100
import csv
import requests

FilePath = open("Employees.csv")
CSV_File = csv.reader(FilePath)
#CSV_File.next()
header = next(CSV_File)
for row in CSV_File:
    url2 = row[0]
    CenterCode = row[1]
    try:
        payload = {}
        payload2 = {}
        headers = {'Authorization': 'apikey'}
        response = requests.request("GET", url2, headers=headers, data=payload)
        EmployeesData = response.json()
        for i in EmployeesData['employees']:
            print(i['employee_id'], end=','), print(CenterCode)
    except Exception as e:
        print(e)
import requests
import pandas as pd
import json

## AA is my DataFrame
AA = AA[['id', 'code']]
#print(AA)
CID = AA['id']
CID2 = CID.to_string(index=False)  # renders the whole column as one string
#print(CID2)
for index in range(len(AA)):
    #print(AA.loc[index, 'id'], AA.loc[index, 'code'])
    try:
        url2 = f"https://api.test.com/v1/centers/{CID2}/employees?page=1&size=100"
        print(url2)
        payload = {}
        files = []
        headers = {'Authorization': 'apikey'}
        response = requests.request("GET", url2, headers=headers, data=payload, files=files)
        data = response.json()
        print('Employee Guid', '|', 'Employee Code', '|', CID2)
    except Exception as e:
        print(e)
Now, in the new code above, I have included the URL directly and substituted only the Center ID via an f-string, extracting the Center ID from a pandas DataFrame. However, when I run the code I get the error "Expecting value: line 1 column 1 (char 0)". I guessed it must be due to the URL, so I printed the URL and found the result below.
Output:
https://api.zenoti.com/v1/centers/ee2395cb-e714-41df-98d2-66a69d38c556
96d901bd-2fcc-4f59-91d7-de18f0b0aa90/employees?page=1&size=100
Expecting value: line 1 column 1 (char 0)
https://api.zenoti.com/v1/centers/ee2395cb-e714-41df-98d2-66a69d38c556
96d901bd-2fcc-4f59-91d7-de18f0b0aa90/employees?page=1&size=100
Expecting value: line 1 column 1 (char 0)
[Finished in 4.6s]
What is happening in the output above is that I have 2 rows to test my code, each containing a unique Center ID; however, both of them are joined together and substituted by the f-string into the URL, hence the error 😓
Any suggestions on what could be done differently here?
Thanks in advance.
If I understand correctly, the Center ID comes from CID = AA['id']. The problem is that CID.to_string(index=False) renders the entire column as one string, so every ID ends up in a single URL.
Try iterating through the id column instead:
for CID2 in AA['id']:
    try:
        url2 = f"https://api.test.com/v1/centers/{CID2}/employees?page=1&size=100"
        print(url2)
    except Exception as e:
        print(e)
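Extending that loop with the request itself, as a sketch that reuses the placeholder endpoint, the apikey header, and the 'employees' response key from your snippets:

for CID2 in AA['id']:
    try:
        url2 = f"https://api.test.com/v1/centers/{CID2}/employees?page=1&size=100"
        headers = {'Authorization': 'apikey'}
        response = requests.get(url2, headers=headers)  # one request per center ID
        for emp in response.json()['employees']:
            print(emp['employee_id'], '|', CID2)
    except Exception as e:
        print(e)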
I could use some help; I am stuck. I want to read a CSV ("IMDb_id.csv") that has one column (Imdb_link) containing random IMDb title URLs. I want to iterate through the rows, grab each URL, and scrape it for the title, image, genre, actors, and director. I then want to download the image, save it under the name of the IMDb title, and put the actors and director in one CSV. The code below works and brings down one poster, and puts the actors and director in their own CSVs, but it doesn't iterate through the CSV or combine the output into one CSV.
print("Current Working Directory " , os.getcwd())
os.chdir('/Users/Desktop/PROJECT MOVIE ANALITICA')
df_IMDb_id_URL = pd.read_csv("IMDb_id.csv")
#print(df_IMDb_id_URL.head(3))
#df_IMDb_id =[]
for column,row in df_IMDb_id_URL.iteritems():
#print(index)
Movie_URL = row
print(Movie_URL)
r = requests.get('https://www.imdb.com/title/tt0037800')
r_unparsed = r.text
start = time.time()
Movie_IMDb_Page = BeautifulSoup(r_unparsed,'lxml')
end = time.time()
for index in Movie_IMDb_Page.find_all("script",type="application/ld+json"):
result_dictionary = json.loads(index.contents[0])
Image_URL = result_dictionary['image']
Movie_ID = result_dictionary['url']
Image_ID_Name = re.sub('/|title', '', Movie_ID)
title = Movie_IMDb_Page.title.text
description = Movie_IMDb_Page.find('div','summary_text').text.strip()
url_response = urllib.request.urlopen(Image_URL)
print(url_response)
# Python Requests Tutorial- Request Web Pages, Download Images, POST Data, Read JSON, and More.mp4
print(Image_URL) #Print URL
Image_URL_request = requests.get(Image_URL)
try:
os.mkdir(os.path.join(os.getcwd(), "POSTER"))
except:
pass
os.chdir(os.path.join(os.getcwd(), "POSTER"))
with open(Image_ID_Name + '.jpg', 'wb') as f:
f.write(Image_URL_request.content)
Actors_List =[]
Directors_List =[]
Creators_List =[]
Genre_List =[]
Movie_ID = result_dictionary['url']
Actors = result_dictionary['actor']
Directors = result_dictionary['director']
Creators = result_dictionary['creator']
Genres = result_dictionary['genre']
print(re.sub('/|title', '', Movie_ID))
print (Movie_ID)
#print (res_str_ID)
#print (Actors)
for index in Actors:
Actors_List.append(str(index[u'name']))
for index in Directors:
Directors_List.append(str(index[u'name']))
#for index in Creators:
# Creators_List.append(str(index[u'name']))
#Method PANDAS
df = pd.DataFrame(Actors_List)
#df.to_csv('Actors_List.csv')
df.to_csv('Actors_List.csv', index=False, header=False) # removes the headers from csv and saves
#Method PANDAS
df = pd.DataFrame(Directors_List)
#df.to_csv('Actors_List.csv')
df.to_csv('Directors_List.csv', index=False, header=False) # removes the headers from csv and saves
print(result_dictionary['contentRating'])
print(Genres)
print(Actors_List)
print(Directors_List)
print(Creators)
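Two things in that snippet keep it from iterating: iteritems() walks columns rather than rows, and the request uses a hard-coded title URL instead of Movie_URL. A minimal sketch of row-wise iteration, assuming the column is named Imdb_link as described:

import pandas as pd
import requests

df_IMDb_id_URL = pd.read_csv("IMDb_id.csv")
for Movie_URL in df_IMDb_id_URL['Imdb_link']:  # one title URL per row
    r = requests.get(Movie_URL)                # fetch that row's page, not a fixed one
    print(Movie_URL, r.status_code)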
I am trying to pass a column value as a URL parameter in a loop, put the result into a JSON file, and later convert it to Excel. I need to do this without creating any function.
import pandas as pd
import requests
import json
from pandas.io.json import json_normalize

df = pd.read_excel('C:/Users/one.xlsx', converters={'name': str})
df = df['name']
df.head()
dd = df.values
user = b"user"
passwd = b'pass'
auth_values = (user, passwd)
for i in dd:
    ur = 'http://xyz.co&name='
    url = ur + str(dd)
    response = (requests.get(url, auth=auth_values).text)
    response_json = json.loads(response)
    response_json = json.dumps(response, ensure_ascii=True)
    writeFile = open('C:/Users/file1.json', 'w')
    writeFile.write(response_json)
    writeFile.close()
    print(url)
You can try this:
import pandas as pd
import requests
import json

df = pd.read_excel('C:/Users/one.xlsx', converters={'name': str})
df = df['name']
df.head()
dd = df.values
user = b"user"
passwd = b'pass'
auth_values = (user, passwd)
with open('C:/Users/file1.json', 'w') as writeFile:
    for i in dd:
        ur = 'http://xyz.co&name='
        url = ur + str(i)  # use the loop variable, not the whole array
        response = requests.get(url, auth=auth_values).text
        response_json = json.dumps(json.loads(response), ensure_ascii=True)  # dump the parsed object, not the raw string
        writeFile.write(response_json)
        print(url)
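One caveat: writing the objects back-to-back produces a file that is not one valid JSON document. A sketch that collects them into a list and dumps it once, with the same assumed endpoint and credentials as above:

results = []
for i in dd:
    url = 'http://xyz.co&name=' + str(i)
    results.append(json.loads(requests.get(url, auth=auth_values).text))
with open('C:/Users/file1.json', 'w') as writeFile:
    json.dump(results, writeFile, ensure_ascii=True)  # a single JSON array holding every response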
For export to Excel:
df = pd.read_excel('C:/Users/one.xlsx', converters={'name': str})
df = df['name']
dd = df.values
user = b"user"
passwd = b'pass'
auth_values = (user, passwd)
df_base = None
for i in dd:
    ur = 'http://xyz.co&name='
    url = ur + str(i)
    response = requests.get(url, auth=auth_values).text
    df = pd.read_json(response)
    if df_base is None:
        df_base = df
    else:
        df_base = df_base.append(df)  # append returns a new DataFrame, so assign it back
    print(url)
df_base.to_excel("C:/Users/output.xlsx")
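A variant using pd.concat, which avoids DataFrame.append (deprecated in newer pandas) altogether, assuming each response parses into a frame with the same columns:

frames = []
for i in dd:
    url = 'http://xyz.co&name=' + str(i)
    frames.append(pd.read_json(requests.get(url, auth=auth_values).text))
pd.concat(frames, ignore_index=True).to_excel("C:/Users/output.xlsx")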