Add suffix on duplicates in pandas dataframe Python - python

I am writing a script to download images.
I'm reading an Excel file as a pandas dataframe:
Column A - url links
Column B - Name
Downloaded images will have this name, for example "A.jpeg".
There will be duplicates in Column B [Name]; in that case I would like to add a suffix to the image name, so the output will be:
A.jpeg
A-1.jpeg
...
import requests
import pandas as pd

df = pd.read_excel(r'C:\Users\exdata1.xlsx')

for index, row in df.iterrows():
    url = row['url']
    file_name = url.split('/')
    r = requests.get(url)
    file_name = (row['name'] + ".jpeg")
    if r.status_code == 200:
        with open(file_name, "wb") as f:
            f.write(r.content)
        print(file_name)
I have been trying cumcount but can't really seem to get it to work.
Appreciate all the help I can get.

You can try:
import requests
import pandas as pd

df = pd.read_excel(r"C:\Users\exdata1.xlsx")

cnt = {}
for index, row in df.iterrows():
    name = row["name"]
    if name not in cnt:
        cnt[name] = 0
        name = f"{name}.jpeg"
    else:
        cnt[name] += 1
        name = f"{name}-{cnt[name]}.jpeg"
    url = row["url"]
    r = requests.get(url)
    if r.status_code == 200:
        with open(name, "wb") as f:
            f.write(r.content)
        print(name)
This will download the files as A.jpeg, A-1.jpeg, A-2.jpeg, ...
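Since you mention cumcount, here is an alternative sketch (assuming the same 'name' and 'url' columns, and that 'name' holds strings) that numbers the duplicates up front with groupby().cumcount() and then downloads using the precomputed file names:

import requests
import pandas as pd

df = pd.read_excel(r"C:\Users\exdata1.xlsx")

# cumcount numbers each repeated name 0, 1, 2, ... within its group
dup_idx = df.groupby("name").cumcount()

# the first occurrence becomes "A.jpeg", later ones "A-1.jpeg", "A-2.jpeg", ...
df["file_name"] = df["name"] + dup_idx.map(lambda n: "" if n == 0 else f"-{n}") + ".jpeg"

for _, row in df.iterrows():
    r = requests.get(row["url"])
    if r.status_code == 200:
        with open(row["file_name"], "wb") as f:
            f.write(r.content)
        print(row["file_name"])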

Related

How to iterate a list of urls using requests.get?

I have such code:
url = "https://www.reformagkh.ru/opendata/export/"
regions = ["150", "101"]
csv_files = []
for region in regions:
result = requests.get(url, params={"t":region})
zf = ZipFile(BytesIO(result.content))
for filename in zf.namelist():
if filename.endswith(".csv"):
file = zf.open(filename)
csv_files.append(file)
if len(csv_files) == 1:
reader = csv.reader(TextIOWrapper(file, 'utf-8'))
for row in reader:
print(row)
else:
print("Error")
I have 2 links where some unzipped CSV files are located, and I need to open and read them. The main question is how to work with a list of urls and open them step by step.
When I try to debug and fix it, I get a 400 error and a problem with the loop. Could somebody give me advice on how to handle it?
I should open and handle such links:
['https://www.reformagkh.ru/opendata/export/150',
'https://www.reformagkh.ru/opendata/export/101']
You need to build the url in the loop instead of passing region as params.
Use f-strings to build the url on Python 3.6+:
for region in regions:
    url_cur = f"{url}{region}"
    result = requests.get(url_cur)
Use format() if you are using a Python version older than 3.6:
for region in regions:
    url_cur = "{}{}".format(url, region)
    result = requests.get(url_cur)
You also need to create a fresh csv_files list for each url.
The complete code would be:
url = "https://www.reformagkh.ru/opendata/export/"
regions = ["150", "101"]
for region in regions:
cur_url = f"{url}{region}"
result = requests.get(cur_url)
zf = ZipFile(BytesIO(result.content))
csv_files = [] # create a new list everytime
for filename in zf.namelist():
if filename.endswith(".csv"):
file = zf.open(filename)
csv_files.append(file)
if len(csv_files) == 1:
reader = csv.reader(TextIOWrapper(file, 'utf-8'))
for row in reader:
print(row)
else:
print("Error")
regions = ["150", "101"]
csv_files = []
for region in regions:
    url = "https://www.reformagkh.ru/opendata/export/%s" % region
    result = requests.get(url)
    zf = ZipFile(BytesIO(result.content))
    for filename in zf.namelist():
        if filename.endswith(".csv"):
            file = zf.open(filename)
            csv_files.append(file)
    if len(csv_files) == 1:
        reader = csv.reader(TextIOWrapper(file, 'utf-8'))
        for row in reader:
            print(row)
    else:
        print("Error")
I think it is much easier with %s. I often use the same method.
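Whichever variant you use, the 400 error mentioned in the question will make ZipFile fail, because an error page is not a valid archive. A small sketch (same url and regions as above) that checks the status code before unzipping:

import requests
from io import BytesIO
from zipfile import ZipFile

url = "https://www.reformagkh.ru/opendata/export/"
regions = ["150", "101"]

for region in regions:
    cur_url = f"{url}{region}"
    result = requests.get(cur_url)
    if result.status_code != 200:
        # skip this region instead of crashing on a non-ZIP error response
        print(f"Request for {cur_url} failed with status {result.status_code}")
        continue
    zf = ZipFile(BytesIO(result.content))
    print(zf.namelist())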

In Python what is the best way to read a pdf table with no outline?

I am trying to read data from a table in a pdf into a pandas dataframe. I am able to do so using tabula-py when the pdf has outlines around the table, but when I try on the pdf without an outline the script produces an error.
For example, I am looking at the pdfs available from two different urls. I have downloaded the pdfs from the urls and saved them as 'JSE Opts.pdf' and 'JSE Divs.pdf' respectively.
import requests
import pandas as pd
url = 'https://clientportal.jse.co.za/JSE%20Equity%20Derivatives/Dividends/ED_DividendsReport.pdf'
response = requests.get(url)
fname = 'JSE Divs.pdf'
f = open(fname, 'wb')
f.write(response.content)
f.close()

url = 'https://clientportal.jse.co.za/JSE%20Equity%20Derivatives/Options%20Daily%20Traded%20Report/ED_OptionsDailyTradedReport.pdf'
response = requests.get(url)
fname = 'JSE Opts.pdf'
f = open(fname, 'wb')
f.write(response.content)
f.close()
I am able to read the 'JSE Opts.pdf' into a pandas dataframe using the code:
import tabula as tb
pdf = './JSE Opts.pdf'
data = tb.read_pdf(pdf,pages = 1)
data = data[0]
print(data)
When I try to do the same for 'JSE Divs.pdf', I get errors and tabula-py is only able to read the header:
pdf = './JSE Divs.pdf'
data = tb.read_pdf(pdf,pages = 1)
data = data[0]
print(data)
I suspect that this is because there are no lines around the table. If that is the case, what is the best way to go about reading the data from 'JSE Divs.pdf' into pandas?
I was able to read the data into a string using pdfplumber, save the string as a CSV file (after cleaning the data to suit my needs) and then import into pandas.
import pdfplumber

pdf = pdfplumber.open("./JSE Divs.pdf")
text = ''
i = 0
while True:
    try:
        text += pdf.pages[i].extract_text() + '\n'
        i = i + 1
    except IndexError:
        break

for replace_s in [' DN', ' CA1', ' ANY', ' CSH', ' PHY', ' QUANTO']:
    text = text.replace(replace_s, '')

while True:
    try:
        idx = text.index('EXO')
        replace_s = text[idx-1:idx+8]
        text = text.replace(replace_s, '')
    except ValueError:
        break

cols = 'EXPIRY_s,USYM,EXPIRY,EX_DATE,CUM_PV_DIVS,CUM_DIVS,ISIN,INSTR_ID\n'
text = text[text.index('Div\n')+4:]
text = cols + text
text = text.replace(' ', ',')

f = open('divs.csv', 'w')
f.write(text)
f.close()
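From there the cleaned CSV can be loaded into pandas in the usual way (a minimal sketch, assuming the divs.csv file written above):

import pandas as pd

# read the CSV produced from the pdfplumber text extraction above
df = pd.read_csv('divs.csv')
print(df.head())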

Store response type to dataframe

I am trying to save the download results to a dataframe. The download portion works, but at the end the dataframe is blank.
import os
import pandas as pd
import requests

# rootpath is defined elsewhere in the script
df = pd.DataFrame()
url = 'https://www.cms.gov/files/zip/monthly-contract-summary-report-april-2020.zip'
FolderYear = '2020'
FolderName = 'ContractSummary'
FileName = 'monthly-contract-summary.zip'
FileDirectory = rootpath + FolderYear + "/" + FolderName + "/"
FullWritePath = rootpath + FolderYear + "/" + FolderName + "/" + FileName

if not os.path.exists(FileDirectory):
    os.makedirs(FileDirectory)

r = requests.get(url)
with open(FullWritePath, 'wb') as f:
    f.write(r.content)

# Retrieve HTTP meta-data
print(r.status_code)
print(r.headers['content-type'])
print(r.encoding)

df['Status'] = r.status_code
df['headers'] = r.status_code
df['FileName'] = FileName
df['FullWritePath'] = FullWritePath
df['ZipFileDowlondLink'] = url
Do this instead of your last few lines:
rows = []
columns = ['Status', 'headers', 'FileName', 'FullWritePath', 'ZipFileDowlondLink']
rows.append([r.status_code, r.status_code, FileName, FullWritePath, url])
df = pd.DataFrame(rows, columns=columns)
print(df)
Status headers FileName FullWritePath ZipFileDowlondLink
0 200 200 monthly-contract-summary.zip .2020/ContractSummary/monthly-contract-summary... https://www.cms.gov/files/zip/monthly-contract...
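The original version stays blank because assigning a scalar to a column of an empty DataFrame produces zero rows. Building the rows first, as above, fixes that; an equivalent sketch builds the one-row frame from a dict (reusing r, FileName, FullWritePath and url from the question, and taking the content type for the headers column, which the print statements suggest was intended):

import pandas as pd

# one dict in a list gives the frame exactly one row
df = pd.DataFrame([{
    'Status': r.status_code,
    'headers': r.headers['content-type'],
    'FileName': FileName,
    'FullWritePath': FullWritePath,
    'ZipFileDowlondLink': url,
}])
print(df)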

Passing dataframe column value as parameter in get url python

I am trying to pass a column value as a url parameter in a loop, and I am also trying to put the result into a json file and later convert it to Excel. I need to do this without creating any function.
import pandas as pd
import requests
import json
from pandas.io.json import json_normalize

df = pd.read_excel('C:/Users/one.xlsx', converters={'name': str})
df = df['name']
df.head()
dd = df.values
user = b"user"
passwd = b'pass'
auth_values = (user, passwd)

for i in dd:
    ur = 'http://xyz.co&name='
    url = ur + str(dd)
    response = (requests.get(url, auth=auth_values).text)
    response_json = json.loads(response)
    response_json = json.dumps(response, ensure_ascii=True)
    writeFile = open('C:/Users/file1.json', 'w')
    writeFile.write(response_json)
    writeFile.close()
    print(url)
You can try this. The main problems are that the url is built from the whole array (str(dd)) instead of the current item (str(i)), and that the output file is reopened in 'w' mode on every iteration, which overwrites the previous response:
import pandas as pd
import requests
import json
from pandas.io.json import json_normalize

df = pd.read_excel('C:/Users/one.xlsx', converters={'name': str})
df = df['name']
df.head()
dd = df.values
user = b"user"
passwd = b'pass'
auth_values = (user, passwd)

with open('C:/Users/file1.json', 'w') as writeFile:
    for i in dd:
        ur = 'http://xyz.co&name='
        url = ur + str(i)
        response = requests.get(url, auth=auth_values).text
        response_json = json.loads(response)
        # dump the parsed object (not the raw string) with ASCII escapes
        response_json = json.dumps(response_json, ensure_ascii=True)
        writeFile.write(response_json)
        print(url)
For export to excel:
df = pd.read_excel('C:/Users/one.xlsx', converters={'name': str})
df = df['name']
dd = df.values
user = b"user"
passwd = b'pass'
auth_values = (user, passwd)

df_base = None
for i in dd:
    ur = 'http://xyz.co&name='
    url = ur + str(i)
    response = requests.get(url, auth=auth_values).text
    df = pd.read_json(response)
    if df_base is None:
        df_base = df
    else:
        # append returns a new frame, so assign the result back
        df_base = df_base.append(df)
    print(url)

df_base.to_excel("C:/Users/output.xlsx")
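Note that DataFrame.append is deprecated in recent pandas releases (and removed in 2.0), so on newer versions, collecting the frames in a list and concatenating once is the usual replacement (a sketch, reusing dd and auth_values from above):

import pandas as pd
import requests

frames = []
for i in dd:
    url = 'http://xyz.co&name=' + str(i)
    response = requests.get(url, auth=auth_values).text
    frames.append(pd.read_json(response))

# one concat at the end instead of repeated append calls
df_base = pd.concat(frames, ignore_index=True)
df_base.to_excel("C:/Users/output.xlsx")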

Ignoring inner header rows in pandas dataframe

I am scraping multiple tables from the web that are exactly like this one (the big batting gamelogs table) and I need the dataframe to ignore the inner header rows that start with the month of the season.
Here is my script so far:
from bs4 import BeautifulSoup
import pandas as pd
import csv
import urllib2

def stir_the_soup():
    player_links = open('player_links.txt', 'r')
    player_ID_nums = open('player_ID_nums.txt', 'r')
    id_nums = [x.rstrip('\n') for x in player_ID_nums]
    idx = 0
    for url in player_links:
        # open the url and create bs object
        player_link = urllib2.urlopen(url)
        bs = BeautifulSoup(player_link, 'html5lib')

        # identify which table is needed
        table_id = ""
        if url[-12] == 'b':
            table_id = "batting"
        elif url[-12] == 'p':
            table_id = "pitching"

        # find the table and create dataframe
        table = str(bs.find('table', {'id': (table_id + '_gamelogs')}))
        df = pd.read_html(table, header=0)
        df2 = df[0]
        df2 = df2[df2.PA != 'PA']

        # for the name of the file and file path
        file_path = '/Users/kramerbaseball/Desktop/MLB_Web_Scraping_Program/game_logs_non_concussed/'
        name_of_file = str(id_nums[idx])
        df2.to_csv(path_or_buf=(file_path + name_of_file + '.csv'), sep=',', encoding='utf-8')
        idx += 1

if __name__ == "__main__":
    stir_the_soup()
I tried taking the dataframe and ignoring the rows where PA == 'PA' or HR == 'HR', but it will not delete the rows. Any help is appreciated.
Notice that in the inner header rows some column values are constant. This will drop the intermediate headers from your df:
df3 = df2[df2['Gtm'] != 'Date']
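After the inner header rows are dropped, the affected columns are still strings because read_html saw mixed text in them, so a follow-up conversion can help (a sketch, assuming df2 from the script above; PA is just an example column):

import pandas as pd

# drop the repeated inner header rows keyed on the 'Gtm' column
df3 = df2[df2['Gtm'] != 'Date'].copy()

# the numeric columns were read as strings because of the header rows mixed in;
# convert the ones you need back to numbers
df3['PA'] = pd.to_numeric(df3['PA'], errors='coerce')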
