I'm trying to use a list of specific codes to index the rows where one of those codes is used, and then return the value of that code along with the parameter name associated with it.
import numpy as np
import pandas as pd
param_list = pd.read_csv(r'C:/Users/Gordo/Documents/GraduateSchool/Research/GroundWaterML/parameter_cd_query.csv')
#def p_list():
# return [param_list['p_cd'], param_list['param_nm']]
for item, value in zip(param_list['p_cd'], param_list['parm_nm']):
    if item in ['p00010','p00020','p00025','p00058','p00059','p00090','p00095','p00191','p00300','p00301','p00400','p00405','p00410',
                'p00450','p00452','p00453','p00602','p00607','p00608','p00613','p00618','p00631','p00660','p00666','p00671',
                'p00681','p00900','p00904','p00905','p00915','p00925','p00930','p00931','p00932','p00935','p00940',
                'p00945','p00950','p00955','p01000','p01005','p01010','p01020','p01025','p01030','p01035','p01040','p01046',
                'p01049','p01060','p01065','p01080','p01085','p01090','p01106','p01130','p01145','p01155','p04035','p07000',
                'p09511','p22703','p29801','p39086','p49933','p50624','p61028','p62636','p62639','p62642','p62645',
                'p63041','p63162','p63790','p70300','p70301','p70303','p71846','p71851','p71856','p71865','p71870','p72015',
                'p72016','p72019','p82081','p82082','p82085','p90095','p99832','p99833','p99834']:
        print(item, value)
If I understand your question correctly, you have your own predefined codes and you're trying to see whether items from a CSV file match any of your codes. If that's the case, you can get all the matches just by filtering the dataframe (since you're using pandas anyway).
import pandas as pd
param_df = pd.read_csv(r'C:/Users/Gordo/Documents/GraduateSchool/Research/GroundWaterML/parameter_cd_query.csv')
my_codes = ['p00010','p00020','p00025','p00058','p00059','p00090','p00095','p00191','p00300','p00301','p00400','p00405','p00410',
'p00450','p00452','p00453','p00602','p00607','p00608','p00613','p00618','p00631','p00660','p00666','p00671',
'p00681','p00900','p00904','p00905','p00915','p00925','p00930','p00931','p00932','p00935','p00940',
'p00945','p00950','p00955','p01000','p01005','p01010','p01020','p01025','p01030','p01035','p01040','p01046',
'p01049','p01060','p01065','p01080','p01085','p01090','p01106','p01130','p01145','p01155','p04035','p07000',
'p09511','p22703','p29801','p39086','p49933','p50624','p61028','p62636','p62639','p62642','p62645',
'p63041','p63162','p63790','p70300','p70301','p70303','p71846','p71851','p71856','p71865','p71870','p72015',
'p72016','p72019','p82081','p82082','p82085','p90095','p99832','p99833','p99834']
result = param_df[param_df.p_cd.isin(my_codes)]
result gives you all matches. If you just want an array with the first match, you can do:
result.iloc[0].values
(.iloc selects by position, so it works even though filtering leaves the original index labels in place).
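If you also want to print each matching code next to its parameter name, as in your original loop, you can iterate over the filtered frame. A minimal sketch, assuming the name column is called parm_nm as in your code:
# Walk the matched rows and print each code with its parameter name
for row in result.itertuples(index=False):
    print(row.p_cd, row.parm_nm)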
I have a data frame in pandas where one of the columns contains time intervals represented as ISO 8601 duration strings like 'P1Y4M1D'.
The example of the whole CSV:
oci,citing,cited,creation,timespan,journal_sc,author_sc
0200100000236252421370109080537010700020300040001-020010000073609070863016304060103630305070563074902,"10.1002/pol.1985.170230401","10.1007/978-1-4613-3575-7_2",1985-04,P2Y,no,no
...
I created a parsing function that takes such a string ('P1Y4M1D') and returns an integer number of days.
How is it possible to change all the column values to parsed values using that function?
import re
import pandas as pd

def do_process_citation_data(f_path):
    global my_ocan
    my_ocan = pd.read_csv("citations.csv",
                          names=['oci', 'citing', 'cited', 'creation', 'timespan', 'journal_sc', 'author_sc'],
                          parse_dates=['creation', 'timespan'])
    my_ocan = my_ocan.iloc[1:]  # drop the first row; iloc selects data by row number
    my_ocan['creation'] = pd.to_datetime(my_ocan['creation'], format="%Y-%m-%d", yearfirst=True)
    return my_ocan

def parse():
    mydict = dict()
    mydict2 = dict()
    i = 1
    r = 1
    for x in my_ocan['oci']:
        mydict[x] = str(my_ocan['timespan'][i])
        i += 1
    print(mydict)
    for key, value in mydict.items():
        is_negative = value.startswith('-')
        if is_negative:
            date_info = re.findall(r"P(?:(\d+)Y)?(?:(\d+)M)?(?:(\d+)D)?$", value[1:])
        else:
            date_info = re.findall(r"P(?:(\d+)Y)?(?:(\d+)M)?(?:(\d+)D)?$", value)
        year, month, day = [int(num) if num else 0 for num in date_info[0]] if date_info else [0, 0, 0]
        daystotal = (year * 365) + (month * 30) + day
        if not is_negative:
            #mydict2[key] = daystotal
            return daystotal
        else:
            #mydict2[key] = -daystotal
            return -daystotal
    #print(mydict2)
    #return mydict2
Probably I do not even need to change the whole column to the parsed values; the final goal is to write a new function that returns the average ['timespan'] of documents created in a particular year. Since I need the parsed values, I thought it would be easier to change the whole column and manipulate the new data frame.
Also, I am curious what a way would be to apply the parsing function to each ['timespan'] row without modifying the data frame. I can only assume it could be something like this, but I don't have a full understanding of how to do that:
for x in my_ocan['timespan']:
    x = parse(str(x))
How can I get a column with new values? Thank you! Peace :)
A df['timespan'].apply(parse) (as mentioned by #Dan) should work. You would only need to modify the parse function so that it receives the string as an argument and returns the parsed value at the end. Something like this:
import pandas as pd

def parse_postal_code(postal_code):
    # Split the postal code and keep the leading letters
    letters = postal_code.split('_')[0]
    return letters

# Example dataframe with three columns and three rows
df = pd.DataFrame({'Age': [20, 21, 22], 'Name': ['John', 'Joe', 'Carla'], 'Postal Code': ['FF_222', 'AA_555', 'BB_111']})

# This returns a new pd.Series
print(df['Postal Code'].apply(parse_postal_code))

# The result can also be assigned to a new column
df['Postal Code Letter'] = df['Postal Code'].apply(parse_postal_code)
print(df['Postal Code Letter'])
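Applied to your timespan column, a minimal sketch could look like the following (assuming the strings match the P..Y..M..D pattern from your regex, approximating a month as 30 days and a year as 365, and introducing a hypothetical timespan_days column name):
import re

def parse_timespan(value):
    # Convert a duration string such as 'P1Y4M1D' into a day count.
    value = str(value)
    is_negative = value.startswith('-')
    if is_negative:
        value = value[1:]
    date_info = re.findall(r"P(?:(\d+)Y)?(?:(\d+)M)?(?:(\d+)D)?$", value)
    year, month, day = [int(num) if num else 0 for num in date_info[0]] if date_info else [0, 0, 0]
    days_total = (year * 365) + (month * 30) + day
    return -days_total if is_negative else days_total

# New column with the parsed day counts, leaving 'timespan' itself untouched
my_ocan['timespan_days'] = my_ocan['timespan'].apply(parse_timespan)

# Average timespan of documents created in a particular year
# (assumes 'creation' has already been converted with pd.to_datetime)
avg_days_per_year = my_ocan.groupby(my_ocan['creation'].dt.year)['timespan_days'].mean()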
I need to convert a Google Cloud Datastore query result to a dataframe, to create a chart from the retrieved data. The query:
def fetch_times(limit):
    start_date = '2019-10-08'
    end_date = '2019-10-19'
    query = datastore_client.query(kind='ParticleEvent')
    query.add_filter('published_at', '>', start_date)
    query.add_filter('published_at', '<', end_date)
    query.order = ['-published_at']
    times = query.fetch(limit=limit)
    return times
creates a JSON-like string of the results for each entity returned by the query:
<Entity('ParticleEvent', 5942717456580608) {'gc_pub_sub_id': '438169950283983', 'data': '605', 'event': 'light intensity', 'published_at': '2019-10-11T14:37:45.407Z', 'device_id': 'e00fce6847be7713698287a1'}>
I thought I had found something that would translate the results to JSON, which I could then convert to a dataframe, but I get an error that the properties attribute does not exist:
def to_json(gql_object):
    result = []
    for item in gql_object:
        result.append(dict([(p, getattr(item, p)) for p in item.properties()]))
    return json.dumps(result, cls=JSONEncoder)
Is there a way to iterate through the query results to get them into a dataframe either directly to a dataframe or by converting to json then to dataframe?
Datastore entities can be treated as Python base dictionaries! So you should be able to do something as simple as...
df = pd.DataFrame(datastore_entities)
...and pandas will do all the rest.
If you need to convert the entity key, or any of its attributes, to a column as well, you can pack them into the dictionaries separately:
for e in entities:
e['entity_key'] = e.key
e['entity_key_name'] = e.key.name # for example
df = pd.DataFrame(entities)
You can use pd.read_json to read a JSON string into a dataframe. Assuming the output is the string you have shared above, the following approach can work.
# Extract the beginning of the dictionary
startPos = line.find("{")
df = pd.DataFrame([eval(line[startPos:-1])])
The output looks like:
gc_pub_sub_id data event published_at \
0 438169950283983 605 light intensity 2019-10-11T14:37:45.407Z
device_id
0 e00fce6847be7713698287a1
Here, line[startPos:-1] is essentially the entire dictionary in the string input (the slice also drops the trailing >). Using eval, we can convert it into an actual dictionary. Once we have that, it can easily be converted into a dataframe object.
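As a side note, eval executes arbitrary code, so if the input is not fully trusted, ast.literal_eval from the standard library is a safer drop-in here. A sketch under the same assumptions, with the example line taken from above:
import ast
import pandas as pd

line = "Entity('ParticleEvent', 5942717456580608) {'gc_pub_sub_id': '438169950283983', 'data': '605', 'event': 'light intensity', 'published_at': '2019-10-11T14:37:45.407Z', 'device_id': 'e00fce6847be7713698287a1'}>"

# Parse only the {...} part; literal_eval accepts literals but never runs code
startPos = line.find("{")
endPos = line.rfind("}") + 1
df = pd.DataFrame([ast.literal_eval(line[startPos:endPos])])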
The original poster found a workaround, which is to convert each item in the query result object to a string and then manually parse the string to extract the needed data into a list.
The return value of the fetch function is a google.cloud.datastore.query.Iterator, which behaves like a List[dict], so the output of fetch can be passed directly into pd.DataFrame:
import pandas as pd
df = pd.DataFrame(fetch_times(10))
This is similar to #bkitej, but I added the use of the original poster's function.
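Since the goal is to build a chart from the retrieved data, it may also help to convert the string-typed columns before plotting. A small sketch, assuming the column names from the entity shown above:
import pandas as pd

df = pd.DataFrame(fetch_times(10))

# Convert the ISO timestamp strings to datetimes and the readings to numbers
df['published_at'] = pd.to_datetime(df['published_at'])
df['data'] = pd.to_numeric(df['data'])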
I am trying to merge two CSVs, transforming the values in one CSV by looking up constant values in the other CSV. I am able to get a Series but not able to get the correct cell value. Can you please suggest a fix?
I am calling the function below while reading the main CSV, to transform the language column:
dataDF['language'] = dataDF['language'].apply(translateLanguagetest)

def translateLanguagetest(keystring):
    print("keystring" + keystring)
    ref_Data_File = Path('C:\sampletest') / "constant.csv"
    refDataDF = pd.read_csv(ref_Data_File)
    refDataDF['refKey'] = refDataDF['sourcedomain'] + "#" + refDataDF['value'] \
        + "#" + refDataDF['targetdomain']
    refDataDF['refValue'] = refDataDF['target']
    modRef = refDataDF['refValue'].where(refDataDF['refKey'] ==
        'languageSRC#' + keystring + '#languagetarget')
    print("modRef: " + modRef)
    cleanedRef = modRef.dropna()
    print(cleanedRef)
    value = cleanedRef.loc[('refValue')]
    return value
The contents of constant.csv are:
value,sourcedomain,targetdomain,target
ita,languageSRC,languagetarget,it
eng,languageSRC,languagetarget,en
Got the solution, and it was a simple one. Being new to Python, it took me some time to find the answer: read the constants CSV up front and pass the constants DataFrame as a parameter to the method that transforms the column values.
import unittest
from pathlib import Path
import pandas as pd

class AdvancedTestSuite(unittest.TestCase):
    """Advanced test cases."""

    def test_transformation(self):
        data_File = Path('C:\Test_python\stackflow') / "data.csv"
        data_mod_File = Path('C:\Test_python\stackflow') / "data_mod.csv"
        dataDF = pd.read_csv(data_File)
        ref_Data_File = Path('C:\Test_python\stackflow') / "constant.csv"
        refDataDF = pd.read_csv(ref_Data_File)
        refDataDF['refKey'] = refDataDF['sourcedomain'] \
            + "#" + refDataDF['value'] + "#" + refDataDF['targetdomain']
        refDataDF['refValue'] = refDataDF['target']
        dataDF['language'] = dataDF['language'].apply(
            lambda x: translateLanguagetest(x, refDataDF))
        dataDF['gender'] = dataDF['gender'].apply(
            lambda x: translateGendertest(x, refDataDF))
        dataDF.to_csv(data_mod_File, index=False)

def translateLanguagetest(keystring, refDataDF):
    print("keystring" + keystring)
    modRef = refDataDF['refValue'].where(refDataDF['refKey'] ==
        'languageSRC#' + keystring + '#languagetarget')
    # Drop the NaN rows; modRef is a pandas Series.
    cleanedRef = modRef.dropna()
    # After cleanup only one row remains, so .item() selects the single value.
    value = cleanedRef.item()
    return value

def translateGendertest(keystring, refDataDF):
    print("keystring" + keystring)
    modRef = refDataDF['refValue'].where(refDataDF['refKey'] ==
        'genderSRC#' + keystring + '#gendertarget')
    # Drop the NaN rows; modRef is a pandas Series.
    cleanedRef = modRef.dropna()
    # After cleanup only one row remains, so .item() selects the single value.
    value = cleanedRef.item()
    return value

if __name__ == '__main__':
    unittest.main()
The data.csv before transformation
Id,language,gender
1,ita,male
2,eng,female
The constant.csv
value,sourcedomain,targetdomain,target
ita,languageSRC,languagetarget,it
eng,languageSRC,languagetarget,en
male,genderSRC,gendertarget,Male
female,genderSRC,gendertarget,Female
The csv after transformation:
Id,language,gender
1,it,Male
2,en,Female
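As a design note, because the lookup table is small and static, the same transformation can also be done without a per-row function at all, by building one mapping per source domain and using Series.map. A sketch, reusing the dataDF and refDataDF names from above:
# Build one value -> target mapping per source domain
lang_map = refDataDF.loc[refDataDF['sourcedomain'] == 'languageSRC'].set_index('value')['target']
gender_map = refDataDF.loc[refDataDF['sourcedomain'] == 'genderSRC'].set_index('value')['target']

# Map each column through its lookup table in one vectorized step
dataDF['language'] = dataDF['language'].map(lang_map)
dataDF['gender'] = dataDF['gender'].map(gender_map)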