I want to iterate through the rows of a dataframe. I read this dataframe from an Excel file, and it has 3 columns. I tried to get the data of each row, convert it to a dict, and then store it in a list.
Dict format: {"subreddit_group": string, "links/caption": list, "Subreddits/flair": list}
my code:
import pandas as pd

data_list = []
linkcap_list = []
subredditflair_list = []
grp = ""
data = pd.read_excel("data.xlsx")
df = pd.DataFrame(data, columns=["subreddit_group", "links/caption", "subreddits/flair"])
data_raw = {}
for index, row in df.iterrows():
    grp = row["subreddit_group"]
    elms = row['links/caption'].split(",,")
    for elm in elms:
        elm_linkcap = {}
        try:
            link, cap = elm.split("^")
            elm_linkcap = {"link": link, "caption": cap}
            linkcap_list.append(elm_linkcap)
        except ValueError as e:
            elm_linkcap = {"link": elm, "caption": "none"}
            linkcap_list.append(elm_linkcap)
    elms = row['subreddits/flair'].split(",")
    for elm in elms:
        elm_subredditflair = {}
        try:
            subreddit, flair = elm.split("^")
            elm_subredditflair = {"subreddit": subreddit, "flair": flair}
            subredditflair_list.append(elm_subredditflair)
        except ValueError as e:
            elm_subredditflair = {"subreddit": elm, "flair": "none"}
            subredditflair_list.append(elm_subredditflair)
    data_raw = {"group": grp, "links_caps": linkcap_list, "subreddits_flairs": subredditflair_list}
    data_list.append(data_raw)
I want to get the value of each column in every row, but each time I iterate I seem to get the value of the whole column. I tried adding an index after the column name to access a specific cell, but I get a weird result, for example like this:
row['links/caption'][1]
How do I iterate through a dataframe's rows and access the value of each column for the current row, not the value of the whole column? I hope I was clear enough with my question. Thanks.
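For what it's worth, a minimal sketch of reading one specific cell directly, assuming the default integer index and the column names above, instead of indexing into the row value as shown:

# df.at[row_label, column_name] returns exactly one cell
first_links_caption = df.at[0, 'links/caption']
print(first_links_caption)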
Edit 1: using itertuples instead.
data_list = []
linkcap_list = []
subredditflair_list = []
grp = ""
data = pd.read_excel("data.xlsx")
df = pd.DataFrame(data, columns=["subreddit_group", "links/caption", "subreddits/flair"])
data_raw = {}
for row in df.itertuples():
    print(row)
    grp = row[1]
    elms_linkscaps = row[2].split(",,")
    elms_subredditsflairs = row[3].split(",")
    print(elms_subredditsflairs)
    print(elms_linkscaps)
    exit()
    for elm in elms_linkscaps:
        elm_linkcap = {}
        try:
            link, cap = elm.split("^")
            elm_linkcap = {"link": link, "caption": cap}
            linkcap_list.append(elm_linkcap)
        except ValueError as e:
            elm_linkcap = {"link": elm, "caption": "none"}
            linkcap_list.append(elm_linkcap)
    for elm in elms_subredditsflairs:
        elm_subredditflair = {}
        try:
            subreddit, flair = elm.split("^")
            elm_subredditflair = {"subreddit": subreddit, "flair": flair}
            subredditflair_list.append(elm_subredditflair)
        except ValueError as e:
            elm_subredditflair = {"subreddit": elm, "flair": "none"}
            subredditflair_list.append(elm_subredditflair)
    data_raw = {"group": grp, "links_caps": linkcap_list, "subreddits_flairs": subredditflair_list}
    data_list.append(data_raw)
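For context, a minimal sketch (assuming the same three column names) showing that with iterrows() each row['column'] is already the single cell value for that row, and that itertuples() exposes the same cells positionally or by attribute:

import pandas as pd

df = pd.read_excel("data.xlsx")  # columns: subreddit_group, links/caption, subreddits/flair

for index, row in df.iterrows():
    # row is a Series; indexing it by column name returns one cell, not the whole column
    print(row["subreddit_group"], row["links/caption"], row["subreddits/flair"])

for row in df.itertuples(index=False):
    # names containing "/" are not valid identifiers, so pandas renames them (_1, _2);
    # positional access is used for those columns here
    print(row.subreddit_group, row[1], row[2])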
Related
This is a typical example of a JSON response from a US Census Geocoder API request for addresses.
When I geocode the addresses using my API call, I collect the payload into a JSON file. When parsing that JSON file with the Python code below, the geocodes sometimes get associated with the wrong input address: when I convert the address geographies to a dataframe, addresses and their geocodes start to mismatch whenever the response hits a timeout, any exception, or random HTML text in the response.
How can I modify my Python script so that the geocodes stay mapped to their corresponding input addresses? Any help would be appreciated!
import json
import numpy as np
import pandas as pd

street = []
city = []
ipstate = []
zipcode = []
status = []
geoid = []
centlat = []
centlon = []
block = []
state = []
basename = []
oid = []
intptlat = []
objectid = []
tract = []
blkgrp = []
arealand = []
intptlon = []
county = []

for i in range(len(payload)):
    if '<!DOCTYPE html>' in payload[i]:
        print(i, 'HTML Response')
        status.append('HTML response')
        geoid.append(np.nan)
        centlat.append(np.nan)
        block.append(np.nan)
        state.append(np.nan)
        basename.append(np.nan)
        oid.append(np.nan)
        intptlat.append(np.nan)
        objectid.append(np.nan)
        tract.append(np.nan)
        centlon.append(np.nan)
        blkgrp.append(np.nan)
        arealand.append(np.nan)
        intptlon.append(np.nan)
        county.append(np.nan)
        street.append(np.nan)
        city.append(np.nan)
        ipstate.append(np.nan)
        zipcode.append(np.nan)
    else:
        data = json.loads(payload[i])
        inputAddress = data['result']['input']['address']
        street.append(inputAddress['street'])
        city.append(inputAddress['city'])
        ipstate.append(inputAddress['state'])
        zipcode.append(inputAddress['zip'])
        censusParams = data['result']['addressMatches']
        if len(censusParams) == 0:
            # print('No Match', i)
            status.append('No Match')
            geoid.append(np.nan)
            centlat.append(np.nan)
            block.append(np.nan)
            state.append(np.nan)
            basename.append(np.nan)
            oid.append(np.nan)
            intptlat.append(np.nan)
            objectid.append(np.nan)
            tract.append(np.nan)
            centlon.append(np.nan)
            blkgrp.append(np.nan)
            arealand.append(np.nan)
            intptlon.append(np.nan)
            county.append(np.nan)
            # print(inputAddress['street'], inputAddress['city'], inputAddress['state'], inputAddress['zip'])
        else:
            # print('Match', i)
            status.append('Match')
            # print(inputAddress['street'], inputAddress['city'], inputAddress['state'], inputAddress['zip'])
            for c in censusParams:
                for key, value in c.items():
                    if key == 'geographies':
                        censusBlocks = dict_get(value, 'Census Blocks')
                        params = censusBlocks[0][0]
                        geoid.append(params['GEOID'])
                        centlat.append(params['CENTLAT'])
                        centlon.append(params['CENTLON'])
                        block.append(params['BLOCK'])
                        state.append(params['STATE'])
                        basename.append(params['BASENAME'])
                        oid.append(params['OID'])
                        intptlat.append(params['INTPTLAT'])
                        intptlon.append(params['INTPTLON'])
                        objectid.append(params['OBJECTID'])
                        tract.append(params['TRACT'])
                        blkgrp.append(params['BLKGRP'])
                        arealand.append(params['AREALAND'])
                        county.append(params['COUNTY'])
df_columns = ['Match',
              'STREET',
              'CITY',
              'IP_STATE',
              'ZIP',
              'GEOID',
              'CENTLAT',
              'CENTLON',
              'BLOCK',
              'STATE',
              'BASENAME',
              'OID',
              'INTPTLAT',
              'INTPTLON',
              'OBJECTID',
              'TRACT',
              'BLKGRP',
              'AREALAND',
              'COUNTY']

json_df = pd.DataFrame(list(zip(status,
                                street,
                                city,
                                ipstate,
                                zipcode,
                                geoid,
                                centlat,
                                centlon,
                                block,
                                state,
                                basename,
                                oid,
                                intptlat,
                                intptlon,
                                objectid,
                                tract,
                                blkgrp,
                                arealand,
                                county)), columns=df_columns)
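One way to keep every address paired with its own geocode, as asked above, is to build one record (a dict) per payload entry and turn the list of records into a DataFrame only at the end; any missing fields then stay on the correct row automatically. A minimal sketch of that idea, assuming the same payload structure and showing only a few of the fields:

import json
import numpy as np
import pandas as pd

records = []
for raw in payload:
    # one record per input address, so rows can never shift out of alignment
    rec = {'Match': np.nan, 'STREET': np.nan, 'CITY': np.nan,
           'GEOID': np.nan, 'CENTLAT': np.nan, 'CENTLON': np.nan}
    if '<!DOCTYPE html>' in raw:
        rec['Match'] = 'HTML response'
    else:
        data = json.loads(raw)
        addr = data['result']['input']['address']
        rec['STREET'], rec['CITY'] = addr['street'], addr['city']
        matches = data['result']['addressMatches']
        if not matches:
            rec['Match'] = 'No Match'
        else:
            rec['Match'] = 'Match'
            # roughly the same nesting the dict_get(...)[0][0] call reaches above
            block = matches[0]['geographies']['Census Blocks'][0]
            rec['GEOID'] = block['GEOID']
            rec['CENTLAT'] = block['CENTLAT']
            rec['CENTLON'] = block['CENTLON']
    records.append(rec)

json_df = pd.DataFrame(records)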
I was able to extract the data from the url_query URL, but additionally I would like to get the data from the list of URLs built from the query['ids'] column of a dataframe. Please see the current logic below:
url = 'https://instancename.some-platform.com/api/now/table/data?display_value=true&'
team = 'query=group_name=123456789'
url_query = url+team
dataframe query:
   ids
0  aaabbb1cccdddeee4ffggghhhhh5iijj
1  aa1bbb2cccdddeee5ffggghhhhh6iijj
issue_list = []
for issue in query['ids']:
    issue_list.append(f'https://instancename.some-platform.com/api/now/table/data?display_value=true&?display_value=true&query=group_name&sys_id={issue}')
response = requests.get(url_query, headers=headers,auth=auth, proxies=proxies)
data = response.json()
def api_response(k):
    dct = dict(
        event_id = k['number'],
        created_time = k['created'],
        status = k['status'],
        created_by = k['raised_by'],
        short_desc = k['short_description'],
        group = k['team']
    )
    return dct
raw_data = []
for p in data['result']:
    rec = api_response(p)
    raw_data.append(rec)

df = pd.DataFrame.from_records(raw_data)
df:
The url_query response extracts what I need, but now I would like to add the data from the URLs in issue_list to the existing df. I don't know how to pass issue_list to the request. I tried response = requests.get(issue_list, headers=headers, auth=auth, proxies=proxies), but I got an invalid schema error.
requests.get expects a single URL, not a list. You can instead create a list of DataFrames, requesting each URL q in a loop instead of url_query, and finally join them together with concat:
dfs = []
for issue in query['ids']:
    q = f'https://instancename.some-platform.com/api/now/table/data?display_value=true&?display_value=true&query=group_name&sys_id={issue}'
    response = requests.get(q, headers=headers, auth=auth, proxies=proxies)
    data = response.json()
    raw_data = [api_response(p) for p in data['result']]
    df = pd.DataFrame.from_records(raw_data)
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True)
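If there are many ids, the same loop can reuse a single requests.Session so the connection, headers, auth, and proxies are set up only once; a minimal sketch under the same assumptions as above (api_response, query, headers, auth, and proxies already defined):

import pandas as pd
import requests

with requests.Session() as session:
    session.headers.update(headers)
    session.auth = auth
    session.proxies.update(proxies)
    dfs = []
    for issue in query['ids']:
        q = f'https://instancename.some-platform.com/api/now/table/data?display_value=true&?display_value=true&query=group_name&sys_id={issue}'
        data = session.get(q).json()
        dfs.append(pd.DataFrame.from_records([api_response(p) for p in data['result']]))

df = pd.concat(dfs, ignore_index=True)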
How do I find the total number of duplicate rows in a file, and how do I write the Python code for it?
import csv

csv_data = csv.reader(open(r'T:\DataDump\Book1.csv'))
next(csv_data)
already_seen = set()
for row in csv_data:
    Address = row[6]
    if Address in already_seen:
        print('{} is a duplicate Address'.format(Address))
    else:
        print('{} is a unique Address'.format(Address))
        already_seen.add(Address)
Try using pandas instead of the csv module
import pandas as pd

csv_data = pd.read_csv('T:/DataDump/Book1.csv')
shape_original = csv_data.shape
print(f"Number of rows: {shape_original[0]}")

# Below: how to drop the duplicates, keeping the first occurrence of each row
csv_data_no_duplicates = csv_data.drop_duplicates(keep="first")
shape_new = csv_data_no_duplicates.shape
print(f"Number of rows: {shape_new[0]}")

number_duplicates = shape_original[0] - shape_new[0]
print(f"Number of duplicate rows: {number_duplicates}")
I tried it with this example to check that it works:
thisdict = {
    "brand": ["Ford", "Renault", "Ford"],
    "model": ["Mustang", "Laguna", "Mustang"],
    "year": ["1964", "1978", "1964"]
}
data = pd.DataFrame.from_dict(thisdict)
data_no_duplicates = data.drop_duplicates(keep="first")
print(data_no_duplicates.head())
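As a side note, pandas can also count duplicates directly without dropping them, and the check can be restricted to a single column; the column name 'Address' below is only an assumption based on the csv code in the question:

import pandas as pd

csv_data = pd.read_csv('T:/DataDump/Book1.csv')

# rows that are exact duplicates of an earlier row
print(csv_data.duplicated().sum())

# rows whose 'Address' value was already seen (assumed column name)
print(csv_data.duplicated(subset=['Address']).sum())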
I am trying to scrape the "PRINCIPAL STOCKHOLDERS" table from the linked text file and convert it to a CSV file. Right now I am only half successful: I can locate the table and parse it, but somehow I cannot convert the text table into a standard one. My code is attached. Can someone help me with it?
import requests
import pandas as pd

url = r'https://www.sec.gov/Archives/edgar/data/1034239/0000950124-97-003372.txt'

# Different approach, the first approach does not work
filing_url = requests.get(url)
content = filing_url.text
splited_data = content.split('\n')
table_title = 'PRINCIPAL STOCKHOLDERS'
END_TABLE_LINE = '- ------------------------'

def find_no_line_start_table(table_title, splited_data):
    found_no_lines = []
    for index, line in enumerate(splited_data):
        if table_title in line:
            found_no_lines.append(index)
    return found_no_lines

table_start = find_no_line_start_table(table_title, splited_data)
# I need help with locating the table. If I locate the table with the above function,
# it returns two locations and I have to choose the correct one manually.
table_start = table_start[1]

def get_start_data_table(table_start, splited_data):
    for index, row in enumerate(splited_data[table_start:]):
        if '<C>' in row:
            return table_start + index

def get_end_table(start_table_data, splited_data):
    for index, row in enumerate(splited_data[start_table_data:]):
        if END_TABLE_LINE in row:
            return start_table_data + index

def row(l):
    l = l.split()
    number_columns = 8
    if len(l) >= number_columns:
        data_row = [''] * number_columns
        first_column_done = False
        index = 0
        for w in l:
            if not first_column_done:
                data_row[0] = ' '.join([data_row[0], w])
                if ':' in w:
                    first_column_done = True
            else:
                index += 1
                data_row[index] = w
        return data_row

start_line = get_start_data_table(table_start, splited_data)
end_line = get_end_table(start_line, splited_data)
table = splited_data[start_line:end_line]

# I also need help with converting the text table to a CSV file; somehow the following
# function does not recognize my columns.
def take_table(table):
    owner = []
    Num_share = []
    middle = []
    middle_1 = []
    middle_2 = []
    middle_3 = []
    prior_offering = []
    after_offering = []
    for r in table:
        data_row = row(r)
        if data_row:
            col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8 = data_row
            owner.append(col_1)
            Num_share.append(col_2)
            middle.append(col_3)
            middle_1.append(col_4)
            middle_2.append(col_5)
            middle_3.append(col_6)
            prior_offering.append(col_7)
            after_offering.append(col_8)
    table_data = {'owner': owner, 'Num_share': Num_share, 'middle': middle, 'middle_1': middle_1,
                  'middle_2': middle_2, 'middle_3': middle_3, 'prior_offering': prior_offering,
                  'after_offering': after_offering}
    return table_data

# print(table)
dict_table = take_table(table)
a = pd.DataFrame(dict_table)
a.to_csv('trail.csv')
I think what you need to do is
pd.DataFrame.from_dict(dict_table)
instead of
pd.DataFrame(dict_table)
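For reference, a small standalone example (with made-up values) of how the two constructors behave: with equal-length lists they give the same result, and from_dict with orient='index' plus a transpose additionally tolerates lists of unequal length:

import pandas as pd

dict_table = {'owner': ['A', 'B', 'C'], 'Num_share': ['10', '20']}

# pd.DataFrame(dict_table) would raise "All arrays must be of the same length" here
df = pd.DataFrame.from_dict(dict_table, orient='index').T
print(df)
#   owner Num_share
# 0     A        10
# 1     B        20
# 2     C       NaN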
I have stored streaming data from Twitter using tweepy. I extracted name, lang, country, and text from the data and stored them in one pandas dataframe.
Now I want to add a gender field to the same dataframe; I get the gender from a gender API with a GET request inside a for loop.
How can I add that gender column to the same dataframe?
tweets_data contains all the data; I am using nameparser to get the first name.
tweets = pd.DataFrame()
tweets['text'] = list(map(lambda tweet: tweet['text'], tweets_data))
tweets['lang'] = list(map(lambda tweet: tweet['lang'], tweets_data))
tweets['country'] = list(map(lambda tweet: tweet['place']['country'] if tweet['place'] is not None else None, tweets_data))
tweets['name'] = list(map(lambda tweet: tweet['user']['name'], tweets_data))

tweets1 = pd.DataFrame()
tweets1['name1'] = list(map(lambda tweet: tweet['user']['name'], tweets_data))
gender_data = []
for i, v in tweets.iterrows():
    try:
        name1 = v['name']
        name = HumanName(name1)
        PARAMS = {'name': name['first']}
        r = requests.get(url=URL, params=PARAMS)
        data = r.json()
        name = data['name']
        gender = data['gender']
        gender_data.append(gender)
        print(gender_data)
    except:
        continue

tweets1 = pd.DataFrame(gender_data, columns=['gender'])
tweets.merge(tweets1, how='left', left_on='name', right_on='name1')
Pandas allows you to just add the field.
Take the following frame:
my_frame = pd.DataFrame({'name': ['bob', 'jack']})
You can add the gender column like this:
my_frame['gender'] = [1,2]
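One caveat for the loop in the question: because failed lookups are skipped with continue, gender_data can end up shorter than the dataframe, and a direct column assignment would then fail. A minimal sketch, under the same assumptions as the question (URL, tweets, and the gender API response as above), that appends None on failure so the list always has one entry per row:

import requests
from nameparser import HumanName

genders = []
for _, v in tweets.iterrows():
    try:
        r = requests.get(url=URL, params={'name': HumanName(v['name']).first})
        genders.append(r.json()['gender'])
    except Exception:
        genders.append(None)  # keep one entry per row so the column stays aligned

tweets['gender'] = genders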