How to change a Python string into a pandas DataFrame?

Hello, fellow developers on StackOverflow.
I have string data like this:
'key=apple; age=10; key=boy; age=3'
How can I convert it into a pandas DataFrame so that key and age become the column headers and the values fill the columns?
key age
apple 10
boy 3

Try this:

import pandas as pd

data = 'key=apple; age=10; key=boy; age=3'
words = data.split(";")
key = []
age = []
for word in words:
    if "key" in word:
        key.append(word.split("=")[1])
    else:
        age.append(word.split("=")[1])

df = pd.DataFrame(key, columns=["key"])
df["age"] = age
print(df)
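If the field names are not fixed to key and age, a slightly more general variant (just a sketch, assuming every token has the form name=value and the names repeat in the same order for each record) collects the values into a dict of lists first:

import pandas as pd

data = 'key=apple; age=10; key=boy; age=3'
columns = {}
for token in data.split(';'):
    name, value = token.split('=')
    # group values under their field name; each repeated name adds a new row
    columns.setdefault(name.strip(), []).append(value.strip())

df = pd.DataFrame(columns)
print(df)
#      key age
# 0  apple  10
# 1    boy   3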

You can try this:

import pandas as pd

str_stream = 'key=apple; age=10; key=boy; age=3'
lst_kv = str_stream.split(';')
# lst_kv => ['key=apple', ' age=10', ' key=boy', ' age=3']
res = [{s.split('=')[0].strip(): s.split('=')[1] for s in lst_kv[i:i + 2]}
       for i in range(0, len(lst_kv), 2)]
df = pd.DataFrame(res)
df

Output:

     key age
0  apple  10
1    boy   3
More explanation of the one-line res:

res = []
for i in range(0, len(lst_kv), 2):
    dct_tmp = {}
    for s in lst_kv[i:i + 2]:
        kv = s.split('=')
        dct_tmp[kv[0].strip()] = kv[1]
    res.append(dct_tmp)
res

Output:

[{'key': 'apple', 'age': '10'}, {'key': 'boy', 'age': '3'}]
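The hard-coded 2 is simply the number of fields per record; if each record carried more fields, the same chunking idea would generalize (a sketch, assuming the fields always repeat in the same order):

n_fields = 2  # fields per record in the original string
res = [
    {s.split('=')[0].strip(): s.split('=')[1].strip() for s in lst_kv[i:i + n_fields]}
    for i in range(0, len(lst_kv), n_fields)
]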

Related

How to Replace Dataframe Column Values Based on Condition of Second Dataframe Values

When a row in replacements.csv has Link Changed = 'Yes', I want to carry out the following:
match replacements.csv > Fruit against main.csv > External Links
replace the matching fruits found in main.csv > External Links with replacements.csv > Fruit Link
To demonstrate, the required output is shown below:
replacements.csv
Fruit,Fruit Link,Link Changed
banana,https://en.wikipedia.org/wiki/Banana,
blueberry,https://en.wikipedia.org/wiki/Blueberry,
strawberry,https://en.wikipedia.org/wiki/Strawberry,Yes
raspberry,https://en.wikipedia.org/wiki/Raspberry,Yes
cherry,https://en.wikipedia.org/wiki/Cherry,
apple,https://en.wikipedia.org/wiki/Apple,Yes
main.csv
Title,External Links
Smoothie Recipes,"['banana', 'blueberry', 'strawberry', 'raspberry', 'apple']"
Fruit Pies,"['cherry', 'apple']"
required output
Title,External Links
Smoothie Recipes,"['banana', 'blueberry', 'https://en.wikipedia.org/wiki/Strawberry', 'https://en.wikipedia.org/wiki/Raspberry', 'https://en.wikipedia.org/wiki/Apple']"
Fruit Pies,"['cherry', 'https://en.wikipedia.org/wiki/Apple']"
Code
import pandas as pd

replacements = pd.read_csv('replacements.csv')
main = pd.read_csv('main.csv')
all_scrapes = []
fruits_found = []

## Replace main.csv > External Links when replacements.csv > Link Changed = Yes
def swap_urls(fruit, fruit_link):
    counter = 0
    while counter < len(main):
        title = main['Title'][counter]
        external_links = main['External Links'][counter]
        fruit_count = len(external_links.split(","))
        fruit_item_row = main['External Links'][counter].replace("'","").replace("[","").replace("]","").replace(" ","")  # [0] represents main.csv row
        items = 0
        while items < fruit_count:
            single_fruit_list = fruit_item_row.split(',')[items]
            if fruit in single_fruit_list:
                print('Current Fruit Item:', single_fruit_list)
                external_links = external_links.replace(fruit, fruit_link)
                #fruits_found.append(fruit)
                product = {
                    'Title': title,
                    'External Link': external_links,
                    #'Fruits Found': fruits_found,
                }
                print(' Product:', product)
                all_scrapes.append(product)
            else:
                pass
            items += 1
        counter += 1
    return all_scrapes

## Pass Fruit & Fruit Link values to function swap_urls when replacements.csv > Link Changed = Yes
y = 0
while y < len(replacements):
    fruit = replacements['Fruit'][y]
    fruit_link = replacements['Fruit Link'][y]
    link_changed = replacements['Link Changed'][y]
    if replacements['Link Changed'][y] == 'Yes':
        print(f'replacement.csv row [{y}]: {fruit}, Fruit Link: {fruit_link}, Link Changed: \x1b[92m{link_changed}\x1b[0m')
        swap_urls(fruit, fruit_link)
    else:
        print(f'replacement.csv row [{y}]: {fruit}, Fruit Link: {fruit_link}, Link Changed: No')
    y += 1

## Save results to File
df = pd.DataFrame(all_scrapes)
print('DF:\n', df)
df.to_excel('Result.xlsx', index=False)
Issue
I'm able to identify the fruits in replacements.csv with their counterparts in main.csv; however, I'm unable to update main.csv > External Links as a single entry when multiple fruits are found (see the generated output file Result.xlsx).
Any help would be much appreciated.
Here is a relatively simple way to do this:

import ast
import pandas as pd

r = pd.read_csv('replacements.csv')
df = pd.read_csv('main.csv')

# make a proper list from the strings in 'External Links':
df['External Links'] = df['External Links'].apply(ast.literal_eval)

# make a dict for mapping
dct = r.dropna(subset=['Link Changed']).set_index('Fruit')['Fruit Link'].to_dict()

>>> dct
{'strawberry': 'https://en.wikipedia.org/wiki/Strawberry',
 'raspberry': 'https://en.wikipedia.org/wiki/Raspberry',
 'apple': 'https://en.wikipedia.org/wiki/Apple'}

# map, leaving the key by default
df['External Links'] = (
    df['External Links'].explode().map(lambda k: dct.get(k, k))
    .groupby(level=0).apply(pd.Series.tolist)
)

# result
>>> df
              Title                                     External Links
0  Smoothie Recipes  [banana, blueberry, https://en.wikipedia.org/w...
1        Fruit Pies      [cherry, https://en.wikipedia.org/wiki/Apple]

# result, as csv (to show quotation marks etc.)
>>> df.to_csv(index=False)
Title,External Links
Smoothie Recipes,"['banana', 'blueberry', 'https://en.wikipedia.org/wiki/Strawberry', 'https://en.wikipedia.org/wiki/Raspberry', 'https://en.wikipedia.org/wiki/Apple']"
Fruit Pies,"['cherry', 'https://en.wikipedia.org/wiki/Apple']"
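Note that Series.explode needs a reasonably recent pandas (it was added in 0.25). If that is not available, the same mapping can be done without explode/groupby, since the column already holds real lists after ast.literal_eval (a small sketch using the same dct as above):

df['External Links'] = df['External Links'].apply(
    lambda links: [dct.get(fruit, fruit) for fruit in links]
)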
import pandas as pd

replacements = pd.read_csv("replacements.csv")
main = pd.read_csv("main.csv")

# returns replacement link or fruit
def fruit_link(x):
    if x not in replacements['Fruit'].values:
        return x
    return replacements.loc[replacements['Fruit'] == x, 'Fruit Link'].values[0] \
        if replacements.loc[replacements['Fruit'] == x, 'Link Changed'].values == 'Yes' else x

# split string of list to list
main["External Links"] = main["External Links"].apply(lambda x: x[1:-1].split(', '))

# explode main to fruits
main = main.explode("External Links")

# remove quotes from fruit names
main["External Links"] = main["External Links"].apply(lambda x: x[1:-1])

# applying fruit_link to retrieve link or fruit
main["External Links"] = main["External Links"].apply(fruit_link)

# implode back
main = main.groupby('Title').agg({'External Links': lambda x: x.tolist()}).reset_index()

OUTPUT:

              Title                                     External Links
0        Fruit Pies      [cherry, https://en.wikipedia.org/wiki/Apple]
1  Smoothie Recipes  [grape, banana, blueberry, https://en.wikipedia.org/wiki/Strawberry, https://en.wikipedia.org/wiki/Raspberry, https://en.wikipedia.org/wiki/Apple, plum]
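A possible variant (just a sketch, starting again from the freshly read main): instead of slicing the quotes off by hand, the stringified lists can be parsed with ast.literal_eval up front, which removes the manual split and quote-stripping steps, as in the previous answer:

import ast

main["External Links"] = main["External Links"].apply(ast.literal_eval)
main = main.explode("External Links")
main["External Links"] = main["External Links"].apply(fruit_link)
main = main.groupby('Title', as_index=False).agg({'External Links': list})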

Rank rows based on similar text using Python?

How do I rank a data frame based on row values? I.e., I have a column that contains text data and want to assign a rank based on similarity.
Expected output
I have tried the Levenshtein distance, but I am not sure how to do it for the whole table.
import string

import pandas as pd
import editdistance as ed  # assumption: ed.eval(a, b) is the Levenshtein distance from the editdistance package

def bow(x=None):
    x = x.lower()
    words = x.split(' ')
    words.sort()
    x = ' '.join(words)
    exclude = set('{}{}'.format(string.punctuation, string.digits))
    x = ''.join(ch for ch in x if ch not in exclude)
    x = '{} '.format(x.strip())
    return x

#intents = load_intents(export=True)
df['bow'] = df['name'].apply(lambda x: bow(x))
df.sort_values(by='bow', ascending=True, inplace=True)

last_bow = ''
recs = []
for idx, row in df.iterrows():
    record = {
        'name': row['name'],
        'bow': row['bow'],
        'lev_distance': ed.eval(last_bow, row['bow'])
    }
    recs.append(record)
    last_bow = row['bow']

intents = pd.DataFrame(recs, columns=['name', 'bow', 'lev_distance'])
l = intents[intents['lev_distance'] <= lev_distance_range]
r = []
for x in l.index.values:
    r.append(x - 1)
    r.append(x)
r = list(set(r))
l = intents.iloc[r, :]
Using textdistance, you could try this:

import pandas as pd
import textdistance

df = pd.DataFrame(
    {
        "text": [
            "Rahul dsa",
            "Rasul dsad",
            "Raul ascs",
            "shrez",
            "Indya",
            "Indi",
            "shez",
            "india",
            "kloa",
            "klsnsd",
        ],
    }
)

df = (
    df
    .assign(
        match=df["text"].map(
            lambda x: [
                i
                for i, text in enumerate(df["text"])
                if textdistance.jaro_winkler(x, text) >= 0.9
            ]
        )
    )
    .sort_values(by="match")
    .drop(columns="match")
)

print(df)
# Output
         text
0   Rahul dsa
1  Rasul dsad
2   Raul ascs
3       shrez
6        shez
4       Indya
5        Indi
7       india
8        kloa
9      klsnsd
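If an explicit group or rank label is wanted rather than just a sorted order, one possible follow-up (a sketch applied to the already sorted frame from above, reusing the same match lists but as tuples so they are hashable) is to factorize them, so rows with identical match sets share a label:

match = df["text"].map(
    lambda x: tuple(
        i
        for i, text in enumerate(df["text"])
        if textdistance.jaro_winkler(x, text) >= 0.9
    )
)
df["group"] = pd.factorize(match)[0]
print(df)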

Pandas how to search one df for a certain date and return that data

I have two data frames. I am trying to search each row by date in the User.csv file, find the corresponding date in the Raven.csv file, and then return the Price from df1 along with the date and amount from df2.
This works, but my Price comes back as a value like [[0.11465]]. Is there a way to remove these brackets, or a better way to do this?
import pandas as pd

df1 = pd.read_csv('Raven.csv')
df2 = pd.read_csv('User.csv')

df1 = df1.reset_index(drop=False)
df1.columns = ['index', 'Date', 'Price']
df2['Timestamp'] = pd.to_datetime(df2['Timestamp'], format="%Y-%m-%d %H:%M:%S").dt.date
df1['Date'] = pd.to_datetime(df1['Date'], format="%Y-%m-%d").dt.date

Looper = 0
Date = []
Price = []
amount = []
total_value = []
for x in df2['Timestamp']:
    search = df2['Timestamp'].values[Looper]
    Date.append(search)
    price = (df1.loc[df1['Date'] == search, ['index']])
    value = df1['Price'].values[price]
    Price.append(value)
    payout = df2['Amount'].values[Looper]
    amount.append(payout)
    payout_value = value * payout
    total_value.append(payout_value)
    Looper = Looper + 1

dict = {'Date': Date, 'Price': Price, 'Payout': amount, "Total Value": total_value}
df = pd.DataFrame(dict)
df.to_csv('out.csv')
You can do indexing to get the value:
value = [[0.11465]][0][0]
print(value)
You get:
0.11465
I hope this is what you need.
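A merge-based alternative avoids the nested indexing altogether, since the price lookup becomes a join on the date (a minimal sketch, assuming df1 and df2 as loaded in the question, with Date/Price in df1 and Timestamp/Amount in df2):

# each df2 row gets the matching day's price from df1 (NaN if no match)
merged = df2.merge(df1[['Date', 'Price']], left_on='Timestamp', right_on='Date', how='left')
merged['Total Value'] = merged['Price'] * merged['Amount']
merged[['Timestamp', 'Price', 'Amount', 'Total Value']].to_csv('out.csv', index=False)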

How to form correct JSON by reading CSV data and use it in an API payload

I am using the below code to form JSON by reading data from a CSV:
import pandas as pd

df = pd.read_csv('/testdata.csv', dtype={
    "debt_type": str,
    "debt_amount": int,
    "interest_rate": float,
    "total_monthly_payment": int,
    "remaining_term,interest_payable": int})

finalList = []
finalDict = {}
grouped = df.groupby(['debt_type'])
for key, value in grouped:
    dictionary = {}
    j = grouped.get_group(key).reset_index(drop=True)
    dictionary['debt_type'] = j.at[0, 'debt_type']
    dictList = []
    anotherDict = {}
    for i in j.index:
        anotherDict['debt_amount'] = j.at[i, 'debt_amount']
        anotherDict['interest_rate'] = j.at[i, 'interest_rate']
        anotherDict['total_monthly_payment'] = j.at[i, 'total_monthly_payment']
        anotherDict['remaining_term'] = j.at[i, 'remaining_term']
        anotherDict['interest_payable'] = j.at[i, 'interest_payable']
        dictList.append(anotherDict)
    dictionary['loan_info'] = dictList
    finalList.append(dictionary)
finalDict = finalList
I want to achieve the below:
{"loan_info":{"debt_amount":9000,"interest_rate":23,"total_monthly_payment":189,"remaining_term":129,"interest_payable":15356},"debt_type":"credit_card"}
However, what I am getting is the below:
[{'debt_type': 'credit_card', 'loan_info': [{'debt_amount': 9000, 'interest_rate': 12.2, 'total_monthly_payment': 189, 'remaining_term': 129, 'interest_payable': 15256}]}]
Can anyone help here? Thanks in advance.
I think what you need is pandas.DataFrame.to_dict() and pandas.DataFrame.to_json().
Right after you read your CSV file, you can create a new column loan_info that formats all the fields you want into a Python dictionary:
loan_info_cols = ['debt_amount', 'interest_rate', 'total_monthly_payment', 'remaining_term', 'interest_payable']
df['loan_info'] = df[loan_info_cols].apply(lambda x: x.to_dict(), axis=1)
Then drop the columns we just used :
df = df.drop(loan_info_cols, axis=1)
This is what we have so far :
print(df)

     debt_type                                          loan_info
0  credit_card  {u'total_monthly_payment': 189.0, u'interest_p...
1   debit_card  {u'total_monthly_payment': 165.0, u'interest_p...
Now you can convert the whole dataframe to JSON :
df_json = df.to_json(orient='records', lines=True)
print(df_json)
{"debt_type":"credit_card","loan_info":{"total_monthly_payment":189.0,"interest_payable":15356.0,"interest_rate":23.0,"debt_amount":9000.0,"remaining_term":129.0}}
{"debt_type":"debit_card","loan_info":{"total_monthly_payment":165.0,"interest_payable":21354.0,"interest_rate":24.0,"debt_amount":8000.0,"remaining_term":167.0}}

Q: Pandas dataframe from for loop

EDIT 2, 9/1: See my answer below!
Pretty new to Python and Pandas here. I've got a script that uses a for loop to query my database using each line in my list. That all works great, but I can't figure out how to build a data frame from the results of that loop. Any and all pointers are welcome!
#Remove stuff
print "Cleaning list"
def multiple_replacer(key_values):
    replace_dict = dict(key_values)
    replacement_function = lambda match: replace_dict[match.group(0)]
    pattern = re.compile("|".join([re.escape(k) for k, v in key_values]), re.M)
    return lambda string: pattern.sub(replacement_function, string)

multi_line = multiple_replacer(key_values)

print "Querying Database..."
for line in source:
    brand_url = multi_line(line)
    #Run Query with cleaned list
    mysql_query = ("select ub.url as 'URL', b.name as 'Name', b.id as 'ID' from api.brand b join api.url_brand ub on b.id=ub.brand_id where ub.url like '%%%s%%' and b.deleted=0 group by 3;" % brand_url)
    list1 = []
    brands = my_query('prod', mysql_query)

print "Writing CSV..."
#Create DF and CSV
for row in brands:
    list1.append({"URL": row['URL'], "Name": ['Name'], "ID": ['ID']})

if brands.shape == (3,0):
    df1 = pd.DataFrame(data=brands, columns=['URL', 'Name', 'ID'])
    output = df1.to_csv('ongoing.csv', index=False)
EDIT 8/30
Here is my edit, attempting to use zyxue's method:

#Remove stuff
print "Cleaning list"
def multiple_replacer(key_values):
    replace_dict = dict(key_values)
    replacement_function = lambda match: replace_dict[match.group(0)]
    pattern = re.compile("|".join([re.escape(k) for k, v in key_values]), re.M)
    return lambda string: pattern.sub(replacement_function, string)

multi_line = multiple_replacer(key_values)

print "Querying Database..."
for line in source:
    brand_url = multi_line(line)
    #Run Query with cleaned list
    mysql_query = ("select ub.url as 'URL', b.name as 'Name', b.id as 'ID' from api.brand b join api.url_brand ub on b.id=ub.brand_id where ub.url like '%%%s%%' and b.deleted=0 group by 3;" % brand_url)
    brands = my_query('prod', mysql_query)

print "Writing CSV..."
#Create DF and CSV
records = []
for row in brands:
    records.append({"URL": row['URL'], "Name": ['Name'], "ID": ['ID']})
    if brands.shape == (3,0):
        records.append(dict(zip(brands, ['URL', 'Name', 'ID'])))

df1 = pd.DataFrame.from_records(records)
output = df1.to_csv('ongoing.csv', index=False)

but this only returns a blank CSV. I'm sure I'm applying it wrong.
records = []
for row in brands:
    # if brands.shape == (3,0):
    #     records.append(dict(zip(brands, ['URL', 'Name', 'ID'])))
    # update bug fix:
    if row.shape == (3,0):
        records.append(dict(zip(row, ['URL', 'Name', 'ID'])))

df1 = pd.DataFrame.from_records(records)
output = df1.to_csv('ongoing.csv', index=False)

# ref:
# >>> pd.DataFrame.from_records([{'a': 1, 'b': 2}, {'a': 11, 'b': 22}])
#     a   b
# 0   1   2
# 1  11  22
Okay, I figured it out, and I thought I should post the working script. @zyxue was pretty much right.
source = open('urls.txt')
key_values = ("http://",""), ("https://",""), ("www.",""), ("\n","")

#Remove stuff
print "Cleaning list"
def multiple_replacer(key_values):
    replace_dict = dict(key_values)
    replacement_function = lambda match: replace_dict[match.group(0)]
    pattern = re.compile("|".join([re.escape(k) for k, v in key_values]), re.M)
    return lambda string: pattern.sub(replacement_function, string)

multi_line = multiple_replacer(key_values)

print "Querying Database..."
records = []
for line in source:
    brand_url = multi_line(line)
    #Run Query with cleaned list
    mysql_query = ("select ub.url as 'URL', b.name as 'Name', b.id as 'ID' from api.brand b join api.url_brand ub on b.id=ub.brand_id where ub.url like '%%%s%%' and b.deleted=0 group by 3;" % brand_url)
    brands = my_query('prod', mysql_query)
    #Append results to dict (records)
    for row in brands:
        records.append({"URL": row['URL'], "Name": row['Name'], "ID": row['ID']})

#Create DataFrame
df = pd.DataFrame.from_dict(records)
#Create CSV
output = df.to_csv('ongoing.csv', index=False)
Essentially, I needed to nest the second for loop under the first and create the records list before the looping began. This appends a record to the list for every row returned for every line in source. Seems like a pretty simple concept now!
