Extract string from data frame in python - python

I am new in Python and I would like to extract a string data from my data frame. Here is my data frame:
Which state has the most counties in it?
Unfortunately I could not extract a string! Here is my code:
import pandas as pd
census_df = pd.read_csv('census.csv')
def answer_five():
return census_df[census_df['COUNTY']==census_df['COUNTY'].max()]['STATE']
answer_five()

How about this:
import pandas as pd
census_df = pd.read_csv('census.csv')
def answer_five():
"""
Returns the 'STATE' corresponding to the max 'COUNTY' value
"""
max_county = census_df['COUNTY'].max()
s = census_df.loc[census_df['COUNTY']==max_county, 'STATE']
return s
answer_five()
This should output a pd.Series object featuring the 'STATE' value(s) where 'COUNTY' is maxed. If you only want the value and not the Series (as your question stated, and since in your image there's only 1 max value for COUNTY) then return s[0] (instead of return s) should do.

def answer_five():
return census_df.groupby('STNAME')['COUNTY'].nunique().idxmax()
You can aggregate data using group by state name, then get count on unique counties and return id of max count.

I had the same issue for some reason I tried using .item() and manage to extract the exact value I needed.
In your case it would look like:
return census_df[census_df['COUNTY'] == census_df['COUNTY'].max()]['STATE'].item()

Related

I want to add an additional column to my df based on multiple condition on another column, all these condition are w.r.t string matching

import pandas as pd
name =
r"C:\Users\saarif2201\Desktop\Classification\Vidio_Indonesia\VidioDataset-Apr'22.xlsx"
df = pd.read_excel(name)
consist = ["Episode","Ep"]
def cat_marking(x):
if consist in (x):
return "Series"
else:
return ""
df['Content_Category'] = df['vod_episode_name'].apply(cat_marking)
This I have written to add a column name content_category and mark some of its rows as series based on the condition(that it contains Episode or Ep as a string) on vod_episode_name column.
I can't access the .xlsx file you posted, but I think that the following code should work for you if you want to apply your function to the vod_episode_name column.
import pandas as pd
name = r"C:\Users\saarif2201\Desktop\Classification\Vidio_Indonesia\VidioDataset-Apr'22.xlsx"
df = pd.read_excel(name)
consist = ["Episode","Ep"]
def cat_marking(x):
if any(ext in x for ext in consist):
return "Series"
else:
return ""
df['Content_Category'] = df['vod_episode_name'].apply(lambda x: cat_marking(x))

Python index value from list, return index value and list value

I'm trying to use a list of specific codes to index any time one of those codes is used and then return the value of that code and the parameter name associated with it.
import numpy as np
import pandas as pd
param_list = pd.read_csv(r'C:/Users/Gordo/Documents/GraduateSchool/Research/GroundWaterML/parameter_cd_query.csv')
#def p_list():
# return [param_list['p_cd'], param_list['param_nm']]
for item, value in param_list['p_cd'], param_list['parm_nm']:
if item in ['p00010','p00020','p00025','p00058','p00059','p00090','p00095','p00191','p00300','p00301','p00400','p00405','p00410',
'p00450','p00452','p00453','p00602','p00607','p00608','p00613','p00618','p00631','p00660','p00666','p00671',
'p00681','p00900','p00904','p00905','p00915','p00925','p00930','p00931','p00932','p00935','p00940',
'p00945','p00950','p00955','p01000','p01005','p01010','p01020','p01025','p01030','p01035','p01040','p01046',
'p01049','p01060','p01065','p01080','p01085','p01090','p01106','p01130','p01145','p01155','p04035','p07000',
'p09511','p22703','p29801','p39086','p49933','p50624','p61028','p62636','p62639','p62642','p62645',
'p63041','p63162','p63790','p70300','p70301','p70303','p71846','p71851','p71856','p71865','p71870','p72015',
'p72016','p72019','p82081','p82082','p82085','p90095','p99832','p99833','p99834']:
print (item, value)
If I understand your question correctly, you have your own predefined codes and you're trying to see if items from a csv file matches any of your codes. If that's the case, you can get all the matches just by filtering the dataframe (since you're using pandas anyway).
import pandas as pd
param_df = pd.read_csv(r'C:/Users/Gordo/Documents/GraduateSchool/Research/GroundWaterML/parameter_cd_query.csv')
my_codes = ['p00010','p00020','p00025','p00058','p00059','p00090','p00095','p00191','p00300','p00301','p00400','p00405','p00410',
'p00450','p00452','p00453','p00602','p00607','p00608','p00613','p00618','p00631','p00660','p00666','p00671',
'p00681','p00900','p00904','p00905','p00915','p00925','p00930','p00931','p00932','p00935','p00940',
'p00945','p00950','p00955','p01000','p01005','p01010','p01020','p01025','p01030','p01035','p01040','p01046',
'p01049','p01060','p01065','p01080','p01085','p01090','p01106','p01130','p01145','p01155','p04035','p07000',
'p09511','p22703','p29801','p39086','p49933','p50624','p61028','p62636','p62639','p62642','p62645',
'p63041','p63162','p63790','p70300','p70301','p70303','p71846','p71851','p71856','p71865','p71870','p72015',
'p72016','p72019','p82081','p82082','p82085','p90095','p99832','p99833','p99834']
result = param_df[param_df.p_cd.isin(my_codes)]
result gives you all matches. If you just want an array with the first match, you can do:
result.loc[0].values

Change Column values in pandas applying another function

I have a data frame in pandas, one of the columns contains time intervals presented as strings like 'P1Y4M1D'.
The example of the whole CSV:
oci,citing,cited,creation,timespan,journal_sc,author_sc
0200100000236252421370109080537010700020300040001-020010000073609070863016304060103630305070563074902,"10.1002/pol.1985.170230401","10.1007/978-1-4613-3575-7_2",1985-04,P2Y,no,no
...
I created a parsing function, that takes that string 'P1Y4M1D' and returns an integer number.
I am wondering how is it possible to change all the column values to parsed values using that function?
def do_process_citation_data(f_path):
global my_ocan
my_ocan = pd.read_csv("citations.csv",
names=['oci', 'citing', 'cited', 'creation', 'timespan', 'journal_sc', 'author_sc'],
parse_dates=['creation', 'timespan'])
my_ocan = my_ocan.iloc[1:] # to remove the first row iloc - to select data by row numbers
my_ocan['creation'] = pd.to_datetime(my_ocan['creation'], format="%Y-%m-%d", yearfirst=True)
return my_ocan
def parse():
mydict = dict()
mydict2 = dict()
i = 1
r = 1
for x in my_ocan['oci']:
mydict[x] = str(my_ocan['timespan'][i])
i +=1
print(mydict)
for key, value in mydict.items():
is_negative = value.startswith('-')
if is_negative:
date_info = re.findall(r"P(?:(\d+)Y)?(?:(\d+)M)?(?:(\d+)D)?$", value[1:])
else:
date_info = re.findall(r"P(?:(\d+)Y)?(?:(\d+)M)?(?:(\d+)D)?$", value)
year, month, day = [int(num) if num else 0 for num in date_info[0]] if date_info else [0,0,0]
daystotal = (year * 365) + (month * 30) + day
if not is_negative:
#mydict2[key] = daystotal
return daystotal
else:
#mydict2[key] = -daystotal
return -daystotal
#print(mydict2)
#return mydict2
Probably I do not even need to change the whole column with new parsed values, the final goal is to write a new function that returns average time of ['timespan'] of docs created in a particular year. Since I need parsed values, I thought it would be easier to change the whole column and manipulate a new data frame.
Also, I am curious what could be a way to apply the parsing function on each ['timespan'] row without modifying a data frame, I can only assume It could be smth like this, but I don't have a full understanding of how to do that:
for x in my_ocan['timespan']:
x = parse(str(my_ocan['timespan'])
How can I get a column with new values? Thank you! Peace :)
A df['timespan'].apply(parse) (as mentioned by #Dan) should work. You would need to modify only the parse function in order to receive the string as an argument and return the parsed string at the end. Something like this:
import pandas as pd
def parse_postal_code(postal_code):
# Splitting postal code and getting first letters
letters = postal_code.split('_')[0]
return letters
# Example dataframe with three columns and three rows
df = pd.DataFrame({'Age': [20, 21, 22], 'Name': ['John', 'Joe', 'Carla'], 'Postal Code': ['FF_222', 'AA_555', 'BB_111']})
# This returns a new pd.Series
print(df['Postal Code'].apply(parse_postal_code))
# Can also be assigned to another column
df['Postal Code Letter'] = df['Postal Code'].apply(parse_postal_code)
print(df['Postal Code Letter'])

converting google datastore query result to pandas dataframe in python

I need to convert a Google Cloud Datastore query result to a dataframe, to create a chart from the retrieved data. The query:
def fetch_times(limit):
start_date = '2019-10-08'
end_date = '2019-10-19'
query = datastore_client.query(kind='ParticleEvent')
query.add_filter(
'published_at', '>', start_date)
query.add_filter(
'published_at', '<', end_date)
query.order = ['-published_at']
times = query.fetch(limit=limit)
return times
creates a json like string of the results for each entity returned by the query:
Entity('ParticleEvent', 5942717456580608) {'gc_pub_sub_id': '438169950283983', 'data': '605', 'event': 'light intensity', 'published_at': '2019-10-11T14:37:45.407Z', 'device_id': 'e00fce6847be7713698287a1'}>
Thought I found something that would translate to json which I could convert to dataframe, but get an error that the properties attribute does not exist:
def to_json(gql_object):
result = []
for item in gql_object:
result.append(dict([(p, getattr(item, p)) for p in item.properties()]))
return json.dumps(result, cls=JSONEncoder)
Is there a way to iterate through the query results to get them into a dataframe either directly to a dataframe or by converting to json then to dataframe?
Datastore entities can be treated as Python base dictionaries! So you should be able to do something as simple as...
df = pd.DataFrame(datastore_entities)
...and pandas will do all the rest.
If you needed to convert the entity key, or any of its attributes to a column as well, you can pack them into the dictionary separately:
for e in entities:
e['entity_key'] = e.key
e['entity_key_name'] = e.key.name # for example
df = pd.DataFrame(entities)
You can use pd.read_json to read your json query output into a dataframe.
Assuming the output is the string that you have shared above, then the following approach can work.
#Extracting the beginning of the dictionary
startPos = line.find("{")
df = pd.DataFrame([eval(line[startPos:-1])])
Output looks like :
gc_pub_sub_id data event published_at \
0 438169950283983 605 light intensity 2019-10-11T14:37:45.407Z
device_id
0 e00fce6847be7713698287a1
Here, line[startPos:-1] is essentially the entire dictionary in that sthe string input. Using eval, we can convert it into an actual dictionary. Once we have that, it can be easily converted into a dataframe object
Original poster found a workaround, which is to convert each item in the query result object to string, and then manually parse the string to extract the needed data into a list.
The return value of the fetch function is google.cloud.datastore.query.Iterator which behaves like a List[dict] so the output of fetch can be passed directly into pd.DataFrame.
import pandas as pd
df = pd.DataFrame(fetch_times(10))
This is similar to #bkitej, but I added the use of the original poster's function.

in python pandas how to get value of dynamic series/ column data transformation

I am trying to merge two csv and transforming the values in one csv by looking up constant values in another csv.i am able to get series but but not able to get the correct cell value. Can you please suggest?
I am calling the below function in reading the main csv and transforming the language column
dataDF['language'] =
dataDF['language'].apply(translateLanguagetest)
def translateLanguagetest( keystring):
print("keystring" + keystring)
ref_Data_File = Path('C:\sampletest')/ "constant.csv"
refDataDF = pd.read_csv(ref_Data_File)
refDataDF['refKey']=refDataDF['sourcedomain']+"#"+refDataDF['value']
+"#"+refDataDF['targetdomain']
refDataDF['refValue']=refDataDF['target']
modRef= refDataDF['refValue'].where(refDataDF['refKey']==
'languageSRC#'+keystring+'#languagetarget')
print("modRef: "+modRef )
cleanedRef = modRef.dropna()
f(cleanedRef)
print(cleanedRef)
value = cleanedRef.loc[('refValue')]
return value
The contents of constant.csv is
value,sourcedomain,targetdomain,target
ita,languageSRC,languagetarget,it
eng,languageSRC,languagetarget,en
Got the solution and it was a simple one. Being new to python, took some time to find the answer. I am reading the constants csv before and passing the constants dataframe as parameter to the method for transformation of column value.
import unittest
from pathlib import Path
import pandas as pd
class AdvancedTestSuite(unittest.TestCase):
"""Advanced test cases."""
def test_transformation(self):
data_File = Path('C:\Test_python\stackflow')/ "data.csv"
data_mod_File = Path('C:\Test_python\stackflow')/ "data_mod.csv"
dataDF = pd.read_csv(data_File)
ref_Data_File = Path('C:\Test_python\stackflow')/ "constant.csv"
refDataDF = pd.read_csv(ref_Data_File)
refDataDF['refKey']=refDataDF['sourcedomain'] \
+"#"+refDataDF['value']+"#"+refDataDF['targetdomain']
refDataDF['refValue']=refDataDF['target']
dataDF['language'] = dataDF['language'].apply(
lambda x: translateLanguagetest(x, refDataDF))
dataDF['gender'] = dataDF['gender'].apply(
lambda x: translateGendertest(x, refDataDF))
dataDF.to_csv(data_mod_File,index=False)
def translateLanguagetest( keystring, refDataDF):
print("keystring" + keystring)
modRef= refDataDF['refValue'].where(refDataDF['refKey']==
'languageSRC#'+keystring+'#languagetarget')
#removes the NaN number. modRef is an numpy.ndarray.
cleanedRef = modRef.dropna()
#after ckeab up,since only one row is remaining, item to select the value
#with one element
value = cleanedRef.item()
return value
def translateGendertest( keystring, refDataDF):
print("keystring" + keystring)
modRef= refDataDF['refValue'].where(refDataDF['refKey']==
'genderSRC#'+keystring+'#gendertarget')
#removes the NaN number modRef is an numpy.ndarray.
cleanedRef = modRef.dropna()
#after ckeab up,since only one row is remaining, item to select the value
value = cleanedRef.item()
return value
if __name__ == '__main__':
unittest.main()
The data.csv before transformation
Id,language,gender
1,ita,male
2,eng,female
The constant.csv
value,sourcedomain,targetdomain,target
ita,languageSRC,languagetarget,it
eng,languageSRC,languagetarget,en
male,genderSRC,gendertarget,Male
female,genderSRC,gendertarget,Female
The csv after transformation:
Id,language,gender
1,it,Male
2,en,Female

Categories