Convert list of lists to custom dictionary - python

I'm unsuccessfully trying to convert a list of lists to a custom dictionary.
I've saved the following output in two lists:
headers = ['CPU', 'name', 'id', 'cused', 'callc', 'mused', 'mallc']
result = [['1/0', 'aaa', '10', '0.1', '15', '10.73', '16.00'],
          ['1/0', 'bbb', '10', '0.1', '20', '11.27', '14.00'],
          ['1/0', 'ccc', '10', '0.2', '10', '11.50', '15.00'],
          ['1/0', 'aaa', '10', '1.1', '15', '15.10', '23.00']]
Formatted output (from an earlier version of the question; some rows differ slightly from the raw lists above, which is why the two answers below print slightly different results):
headers:
slot name id cused callc mused mallc
result:
1/0 aaa 10 0.1 15 10.73 16.00
2/0 bbb 25 0.1 20 11.39 14.00
1/0 ccc 10 0.2 10 11.50 15.00
1/0 aaa 10 1.1 15 15.10 23.00
The first n columns (3 in this case) should be concatenated into the key name, with each of the remaining columns emitted as a separate key/value pair.
I would like to convert it to a dictionary in the following format:
slot.<slot>.name.<name>.id.<id>.cused:<value>,
slot.<slot>.name.<name>.id.<id>.callc:<value>,
slot.<slot>.name.<name>.id.<id>.mused:<value>,
slot.<slot>.name.<name>.id.<id>.mallc:<value>,
...
for example:
dictionary = {
    'slot.1/0.name.aaa.id.10.cused': 0.1,
    'slot.1/0.name.aaa.id.10.callc': 15,
    'slot.1/0.name.aaa.id.10.mused': 10.73,
    'slot.1/0.name.aaa.id.10.mallc': 16.00,
    'slot.1/0.name.bbb.id.10.cused': 0.1,
    ...
    'slot.<slot>.name.<name>.id.<id>.<value_name>': <value>
}
Can you show me how that can be done?

Updated: OP added raw lists
Now that you have updated the question to show the raw lists, it's even easier:
headers = ['CPU', 'name', 'id', 'cused', 'callc', 'mused', 'mallc']
result = [['1/0', 'aaa', '10', '0.1', '15', '10.73', '16.00'],
          ['1/0', 'bbb', '10', '0.1', '20', '11.27', '14.00'],
          ['1/0', 'ccc', '10', '0.2', '10', '11.50', '15.00'],
          ['1/0', 'aaa', '10', '1.1', '15', '15.10', '23.00']]
results = {}
for r in result:
    slot, name, _id = r[:3]
    results.update(
        {'slot.{}.name.{}.id.{}.{}'.format(slot, name, _id, k): v
         for k, v in zip(headers[3:], r[3:])})
>>> from pprint import pprint
>>> pprint(results)
{'slot.1/0.name.aaa.id.10.callc': '15',
'slot.1/0.name.aaa.id.10.cused': '1.1',
'slot.1/0.name.aaa.id.10.mallc': '23.00',
'slot.1/0.name.aaa.id.10.mused': '15.10',
'slot.1/0.name.bbb.id.10.callc': '20',
'slot.1/0.name.bbb.id.10.cused': '0.1',
'slot.1/0.name.bbb.id.10.mallc': '14.00',
'slot.1/0.name.bbb.id.10.mused': '11.27',
'slot.1/0.name.ccc.id.10.callc': '10',
'slot.1/0.name.ccc.id.10.cused': '0.2',
'slot.1/0.name.ccc.id.10.mallc': '15.00',
'slot.1/0.name.ccc.id.10.mused': '11.50'}
Note that the first and fourth rows share the same slot, name and id, so they produce identical keys and the later row silently overwrites the earlier one; that is why cused for aaa shows '1.1' rather than '0.1'.
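If you want numeric values instead of the strings above (the example dictionary in the question shows numbers), a small variant of the same comprehension, as a sketch that assumes every stat column parses as a number:

def to_number(s):
    # parse a numeric string as int when possible, otherwise float
    try:
        return int(s)
    except ValueError:
        return float(s)

results = {
    'slot.{}.name.{}.id.{}.{}'.format(slot, name, _id, k): to_number(v)
    for slot, name, _id, *stats in result
    for k, v in zip(headers[3:], stats)
}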
Original file-based answer
The following code will construct the required dictionary (results). The idea is that each non-header line in the file is split by whitespace into fields, and the fields are used in a dictionary comprehension to construct a dictionary for each line, which is then used to update the results dictionary.
with open('data') as f:
    # skip the 3 header lines
    for i in range(3):
        _ = next(f)

    STAT_NAMES = 'cused callc mused mallc'.split()
    results = {}
    for line in f:
        line = line.split()
        slot, name, _id = line[:3]
        results.update(
            {'slot.{}.name.{}.id.{}.{}'.format(slot, name, _id, k): v
             for k, v in zip(STAT_NAMES, line[3:])})
Output
>>> from pprint import pprint
>>> pprint(results)
{'slot.1/0.name.aaa.id.10.callc': '15',
'slot.1/0.name.aaa.id.10.cused': '1.1',
'slot.1/0.name.aaa.id.10.mallc': '23.00',
'slot.1/0.name.aaa.id.10.mused': '15.10',
'slot.1/0.name.ccc.id.10.callc': '10',
'slot.1/0.name.ccc.id.10.cused': '0.2',
'slot.1/0.name.ccc.id.10.mallc': '15.00',
'slot.1/0.name.ccc.id.10.mused': '11.50',
'slot.2/0.name.bbb.id.25.callc': '20',
'slot.2/0.name.bbb.id.25.cused': '0.1',
'slot.2/0.name.bbb.id.25.mallc': '14.00',
'slot.2/0.name.bbb.id.25.mused': '11.39'}

Try this. Note: I renamed "CPU" to "slot" in the headers.
headers = ['slot', 'name', 'id', 'cused', 'callc', 'mused', 'mallc']
result = [['1/0', 'aaa', '10', '0.1', '15', '10.73', '16.00'],
['1/0', 'bbb', '10', '0.1', '20', '11.27', '14.00'],
['1/0', 'ccc', '10', '0.2', '10', '11.50', '15.00'],
['1/0', 'aaa', '10', '1.1', '15', '15.10', '23.00']]
# transposing gives: [['1/0', '1/0', '1/0', '1/0'], ['aaa', 'bbb', 'ccc', 'aaa'], ...
transpose_result = map(list, zip(*result))
# zipping with the headers gives: {'slot': ['1/0', '1/0', '1/0', '1/0'],
#                                  'mallc': ['16.00', '14.00', '15.00', '23.00'], ...
data = dict(zip(headers, transpose_result))
d = {}
for reg in ("cused", "callc", "mused", "mallc"):
    for i, val in enumerate(data[reg]):
        # build the key as slot.<slot>.name.<name>.id.<id>.<stat>
        key = []
        for reg2 in ("slot", "name", "id"):
            key.append(reg2)
            key.append(data[reg2][i])
        key.append(reg)
        d[".".join(key)] = val
You then get in d:
{
'slot.1/0.name.bbb.id.10.cused': '0.1',
'slot.1/0.name.aaa.id.10.cused': '1.1',
'slot.1/0.name.bbb.id.10.callc': '20',
'slot.1/0.name.aaa.id.10.mallc': '23.00',
'slot.1/0.name.aaa.id.10.callc': '15',
'slot.1/0.name.ccc.id.10.mallc': '15.00',
'slot.1/0.name.ccc.id.10.mused': '11.50',
'slot.1/0.name.aaa.id.10.mused': '15.10',
'slot.1/0.name.ccc.id.10.cused': '0.2',
'slot.1/0.name.ccc.id.10.callc': '10',
'slot.1/0.name.bbb.id.10.mallc': '14.00',
'slot.1/0.name.bbb.id.10.mused': '11.27'}

import itertools

headers = 'slot name id cused callc mused mallc'.split()
result = ['1/0 aaa 10 0.1 15 10.73 16.00'.split(),
          '2/0 bbb 25 0.1 20 11.39 14.00'.split()]
key_len = 3
d = {}
for row in result:
    key_start = '.'.join(itertools.chain(*zip(headers, row[:key_len])))
    for key_end, val in zip(headers[key_len:], row[key_len:]):
        d[key_start + '.' + key_end] = val
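For reference, pprint(d) then shows:
{'slot.1/0.name.aaa.id.10.callc': '15',
 'slot.1/0.name.aaa.id.10.cused': '0.1',
 'slot.1/0.name.aaa.id.10.mallc': '16.00',
 'slot.1/0.name.aaa.id.10.mused': '10.73',
 'slot.2/0.name.bbb.id.25.callc': '20',
 'slot.2/0.name.bbb.id.25.cused': '0.1',
 'slot.2/0.name.bbb.id.25.mallc': '14.00',
 'slot.2/0.name.bbb.id.25.mused': '11.39'}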

Another solution, with the correct types for cused, callc, mused and mallc:
labels = ['slot', 'name', 'id', 'cused', 'callc', 'mused', 'mallc']
data = ['1/0 aaa 10 0.1 15 10.73 16.00',
        '2/0 bbb 25 0.1 20 11.39 14.00',
        '1/0 ccc 10 0.2 10 11.50 15.00',
        '1/0 aaa 10 1.1 15 15.10 23.00']
data = [tuple(e.split()) for e in data]
# materialize the zip so the pairs can be indexed (zip() is lazy in Python 3)
data = [list(zip(labels, e)) for e in data]
results = dict()
for e in data:
    s = '%s.%s.%s' % tuple('.'.join(e[i]) for i in range(3))
    for i in range(3, 7):
        # index 4 (callc) is a count, so cast to int; the others are floats
        results['%s.%s' % (s, e[i][0])] = int(e[i][1]) if i == 4 else float(e[i][1])
print(results)

Related

Add columns to dataframe based on a dictionary

If I have a dataframe like this:
df = pd.DataFrame({
    'ID': ['1', '4', '4', '3', '3', '3'],
    'club': ['arts', 'math', 'theatre', 'poetry', 'dance', 'cricket']
})
and I have a dictionary named tag_dict:
{'1': {'Granted'},
'3': {'Granted'}}
The keys of the dictionary match some of the IDs in the ID column of the dataframe.
Now I want to create a new column "Tag" in the dataframe such that:
if a value in the ID column matches a key of the dictionary, place that key's dictionary value in the field; otherwise place '-' there.
The output should look like this:
df = pd.DataFrame({
    'ID': ['1', '4', '4', '3', '3', '3'],
    'club': ['arts', 'math', 'theatre', 'poetry', 'dance', 'cricket'],
    'tag': ['Granted', '-', '-', 'Granted', 'Granted', 'Granted']
})
import pandas as pd

df = pd.DataFrame({
    'ID': ['1', '4', '4', '3', '3', '3'],
    'club': ['arts', 'math', 'theatre', 'poetry', 'dance', 'cricket']})
# I've removed the {} around your items. Feel free to add more key:value pairs
my_dict = {'1': 'Granted', '3': 'Granted'}
# use .map() to match your keys to their values
df['Tag'] = df['ID'].map(my_dict)
# if required, fill in NaN values with '-'
nan_rows = df['Tag'].isna()
df.loc[nan_rows, 'Tag'] = '-'
df
End result:
  ID     club      Tag
0  1     arts  Granted
1  4     math        -
2  4  theatre        -
3  3   poetry  Granted
4  3    dance  Granted
5  3  cricket  Granted
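The .map() and fill steps can also be collapsed into one line, assuming the same flattened my_dict:
df['Tag'] = df['ID'].map(my_dict).fillna('-')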
I'm not sure what the purpose of the curly brackets around 'Granted' is, but you could use apply:
df = pd.DataFrame({
    'ID': ['1', '4', '4', '3', '3', '3'],
    'club': ['arts', 'math', 'theatre', 'poetry', 'dance', 'cricket']
})
tag_dict = {'1': 'Granted',
            '3': 'Granted'}
df['tag'] = df['ID'].apply(lambda x: tag_dict.get(x, '-'))
print(df)
Output:
ID club tag
0 1 arts Granted
1 4 math -
2 4 theatre -
3 3 poetry Granted
4 3 dance Granted
5 3 cricket Granted
Solution with .map, using the original dict whose values are sets such as {'Granted'} (hence the [*x][0] to unwrap the single element):
dct = {'1': {'Granted'}, '3': {'Granted'}}
df["tag"] = df["ID"].map(dct).apply(lambda x: "-" if pd.isna(x) else [*x][0])
print(df)
Prints:
ID club tag
0 1 arts Granted
1 4 math -
2 4 theatre -
3 3 poetry Granted
4 3 dance Granted
5 3 cricket Granted

How to transform index values into columns using Pandas?

I have a dictionary like this:
my_dict = {'RuleSet': {'0': {'RuleSetID': '0',
                             'RuleSetName': 'Allgemein',
                             'Rules': [{'RulesID': '10',
                                        'RuleName': 'Gemeinde Seiten',
                                        'GroupHits': '2',
                                        'KeyWordGroups': ['100', '101', '102']}]},
                       '1': {'RuleSetID': '1',
                             'RuleSetName': 'Portale Berlin',
                             'Rules': [{'RulesID': '11',
                                        'RuleName': 'Portale Berlin',
                                        'GroupHits': '4',
                                        'KeyWordGroups': ['100', '101', '102', '107']}]},
                       '6': {'RuleSetID': '6',
                             'RuleSetName': 'Zwangsvollstr. Berlin',
                             'Rules': [{'RulesID': '23',
                                        'RuleName': 'Zwangsvollstr. Berlin',
                                        'GroupHits': '1',
                                        'KeyWordGroups': ['100', '101']}]}}}
When using this code snippet it can be transformed into a dataframe:
rules_pd = pd.DataFrame(my_dict['RuleSet'])
rules_pd
The result is a frame with the RuleSet keys ('0', '1', '6') as columns and the fields (RuleSetID, RuleSetName, Rules) as rows (shown as a screenshot in the original question).
I would like to flip it so that each RuleSet becomes a row instead (also shown as a screenshot).
Does anyone know how to tackle this challenge?
Use DataFrame.from_dict with orient='index':
out = pd.DataFrame.from_dict(my_dict['RuleSet'], 'index')
Out[692]:
RuleSetID ... Rules
0 0 ... [{'RulesID': '10', 'RuleName': 'Gemeinde Seite...
1 1 ... [{'RulesID': '11', 'RuleName': 'Portale Berlin...
6 6 ... [{'RulesID': '23', 'RuleName': 'Zwangsvollstr....
[3 rows x 3 columns]
#out.columns
#Out[693]: Index(['RuleSetID', 'RuleSetName', 'Rules'], dtype='object')
You could try using transpose():
rules_pd = pd.DataFrame(my_dict['RuleSet']).transpose()
print(rules_pd)
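If the desired layout should also expand the nested Rules list (it is hard to tell from the screenshots), pd.json_normalize is one option; a sketch assuming one output row per rule:
rows = list(my_dict['RuleSet'].values())
rules_flat = pd.json_normalize(rows, record_path='Rules',
                               meta=['RuleSetID', 'RuleSetName'])
print(rules_flat)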

PANDAS create column by iterating row by row checking values in 2nd dataframe until all values are true

df1 = pd.DataFrame({
    'ItemNo': ['001', '002', '003', '004', '005'],
    'L': ['5', '65.0', '445.0', '3200', '65000'],
    'H': ['2', '15.5', '150.5', '1500', '54000'],
    'W': ['5', '85.0', '640.0', '1650', '45000']
})
df2 = pd.DataFrame({
    'Rank': ['1', '2', '3', '4', '5'],
    'Length': ['10', '100', '1000', '10000'],
    'Width': ['10', '100', '1000', '10000'],
    'Height': ['10', '100', '1000', '10000'],
    'Code': ['aa', 'bb', 'cc', 'dd', 'ee']
})
So here I have two example dataframes. The first dataframe shows unique item numbers with given dimensions. df2 shows the maximum allowable dimensions for a given rank and code, meaning no element (length, width, height) may exceed the given maximums. (Note: as written, the df2 constructor raises a ValueError because 'Length', 'Width' and 'Height' have four entries while 'Rank' and 'Code' have five; the answers below pad them to five.) I would like to check the dimensions in df1 against df2 until all dimension criteria are True, in order to retrieve its 'rank' and 'code'. So, in essence, iterate down row by row of df2 until all the criteria are True.
Make a new df3 as follows:
ItemNo Rank Code
001 1 aa
002 2 bb
003 3 cc
004 4 dd
005 5 ee
Using numpy:
- changed the sample data so that the results are not just incrementing
- get the index of the row in df2 that matches the required logic
- build df3 using the index from step 2
df1 = pd.DataFrame({
    'ItemNo': ['001', '002', '003', '004', '005'],
    'L': ['5', '65.0', '445.0', '5', '65000'],
    'H': ['2', '15.5', '150.5', '5', '54000'],
    'W': ['5', '85.0', '640.0', '5', '45000']
})
df2 = pd.DataFrame({
    'Rank': ['1', '2', '3', '4', '5'],
    'Length': ['10', '100', '1000', '10000', '100000'],
    'Width': ['10', '100', '1000', '10000', '100000'],
    'Height': ['10', '100', '1000', '10000', '100000'],
    'Code': ['aa', 'bb', 'cc', 'dd', 'ee']
})
# fix up datatypes for comparisons
df1.loc[:, ["L", "H", "W"]] = df1.loc[:, ["L", "H", "W"]].astype(float)
df2.loc[:, ["Length", "Height", "Width"]] = df2.loc[:, ["Length", "Height", "Width"]].astype(float)
# row-by-row comparison, argmax to get the first True
idx = [np.argmax((df1.loc[r, ["L", "H", "W"]].values
                  < df2.loc[:, ["Length", "Height", "Width"]].values).all(axis=1))
       for r in df1.index]
# finally the result
pd.concat([df1.ItemNo, df2.loc[idx, ["Rank", "Code"]].reset_index(drop=True)], axis=1)
  ItemNo Rank Code
0    001    1   aa
1    002    2   bb
2    003    3   cc
3    004    1   aa
4    005    5   ee
I think you can try:
import pandas as pd
import numpy as np
df1 = pd.DataFrame({
    'ItemNo': ['001', '002', '003', '004', '005', '006'],
    'L': ['5', '65.0', '445.0', '3200', '65000', '10'],
    'H': ['2', '15.5', '150.5', '1500', '54000', '1000'],
    'W': ['5', '85.0', '640.0', '1650', '45000', '10']
})
df2 = pd.DataFrame({
    'Rank': ['1', '2', '3', '4', '5'],
    'Length': ['10', '100', '1000', '10000', '100000'],
    'Width': ['10', '100', '1000', '10000', '100000'],
    'Height': ['10', '100', '1000', '10000', '100000'],
    'Code': ['aa', 'bb', 'cc', 'dd', 'ee']
})
df_sort = pd.DataFrame({'W': np.searchsorted(df2['Width'].astype(float), df1['W'].astype(float)),
                        'H': np.searchsorted(df2['Height'].astype(float), df1['H'].astype(float)),
                        'L': np.searchsorted(df2['Length'].astype(float), df1['L'].astype(float))})
df1['Rank'] = df_sort.max(axis=1).map(df2['Rank'])
df1['Code'] = df1['Rank'].map(df2.set_index('Rank')['Code'])
print(df1)
Output:
ItemNo L H W Rank Code
0 001 5 2 5 1 aa
1 002 65.0 15.5 85.0 2 bb
2 003 445.0 150.5 640.0 3 cc
3 004 3200 1500 1650 4 dd
4 005 65000 54000 45000 5 ee
5 006 10 1000 10 3 cc
The core of the code is the np.searchsorted function, which finds the index at which, for example, the value of L would be inserted into Length, per the conditions in the documentation. I use np.searchsorted for each of the three dimensions, then take the largest of the three bucket indices with max(axis=1) and assign the rank and code based on that largest value using map.
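To see the bucketing in isolation: np.searchsorted returns the insertion index that keeps the array sorted, which here acts as the rank index.
import numpy as np

thresholds = np.array([10., 100., 1000., 10000., 100000.])
# 65 would be inserted at index 1 (between 10 and 100), i.e. Rank '2'
print(np.searchsorted(thresholds, 65.0))    # 1
print(np.searchsorted(thresholds, 3200.0))  # 3, i.e. Rank '4'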

How to read a JSON retrieved from an API and save it into a CSV file?

I am using a weather API that responses with a JSON file. Here is a sample of the returned readings:
{
  'data': {
    'request': [{
      'type': 'City',
      'query': 'Karachi, Pakistan'
    }],
    'weather': [{
      'date': '2019-03-10',
      'astronomy': [{
        'sunrise': '06:46 AM',
        'sunset': '06:38 PM',
        'moonrise': '09:04 AM',
        'moonset': '09:53 PM',
        'moon_phase': 'Waxing Crescent',
        'moon_illumination': '24'
      }],
      'maxtempC': '27',
      'maxtempF': '80',
      'mintempC': '22',
      'mintempF': '72',
      'totalSnow_cm': '0.0',
      'sunHour': '11.6',
      'uvIndex': '7',
      'hourly': [{
        'time': '24',
        'tempC': '27',
        'tempF': '80',
        'windspeedMiles': '10',
        'windspeedKmph': '16',
        'winddirDegree': '234',
        'winddir16Point': 'SW',
        'weatherCode': '116',
        'weatherIconUrl': [{
          'value': 'http://cdn.worldweatheronline.net/images/wsymbols01_png_64/wsymbol_0002_sunny_intervals.png'
        }],
        'weatherDesc': [{
          'value': 'Partly cloudy'
        }],
        'precipMM': '0.0',
        'humidity': '57',
        'visibility': '10',
        'pressure': '1012',
        'cloudcover': '13',
        'HeatIndexC': '25',
        'HeatIndexF': '78',
        'DewPointC': '15',
        'DewPointF': '59',
        'WindChillC': '24',
        'WindChillF': '75',
        'WindGustMiles': '12',
        'WindGustKmph': '19',
        'FeelsLikeC': '25',
        'FeelsLikeF': '78',
        'uvIndex': '0'
      }]
    }]
  }
}
I used the following Python code in my attempt to read the data stored in the JSON file:
import simplejson as json
data_file = open("new.json", "r")
values = json.load(data_file)
But this fails with the following error:
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
I am also wondering how I can save the result in a structured format in a CSV file using Python.
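On the error itself: "Expecting value: line 1 column 1 (char 0)" usually means the parser hit something other than JSON at the very start of the file, such as an empty file or, as the single quotes in the sample above suggest, a Python-style dict rather than strict JSON. If the file really holds that single-quoted text (an assumption about the file's contents, not a general fix), one sketch is to parse it as a Python literal instead:
import ast

with open("new.json") as f:
    values = ast.literal_eval(f.read())
print(values['data']['weather'][0]['date'])  # '2019-03-10'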
As stated below by Rami, the simplest way to do this would be to use pandas: either a) pd.read_json(), or b) pd.DataFrame.from_dict(). However, the issue in this particular case is that you have a nested dictionary/JSON. What do I mean by nested? Well, if you were to simply put this into a dataframe, you'd have this:
print (df)
request weather
0 {'type': 'City', 'query': 'Karachi, Pakistan'} {'date': '2019-03-10', 'astronomy': [{'sunrise...
Which is fine if that's what you want. However, I am assuming you'd like all the data/instances flattened into a single row.
So you'll need to either use json_normalize to unravel it (which is possible, but you'd need to be certain the JSON follows the same format/keys throughout, and you'd still need to pull out each of the dictionaries within the lists within the dictionaries), or use some function to flatten out the nested JSON. From there you can simply write to file.
I chose to flatten it using a function, then construct the dataframe:
import pandas as pd
import re
data = {'data': {'request': [{'type': 'City', 'query': 'Karachi, Pakistan'}], 'weather': [{'date': '2019-03-10', 'astronomy': [{'sunrise': '06:46 AM', 'sunset': '06:38 PM', 'moonrise': '09:04 AM', 'moonset': '09:53 PM', 'moon_phase': 'Waxing Crescent', 'moon_illumination': '24'}], 'maxtempC': '27', 'maxtempF': '80', 'mintempC': '22', 'mintempF': '72', 'totalSnow_cm': '0.0', 'sunHour': '11.6', 'uvIndex': '7', 'hourly': [{'time': '24', 'tempC': '27', 'tempF': '80', 'windspeedMiles': '10', 'windspeedKmph': '16', 'winddirDegree': '234', 'winddir16Point': 'SW', 'weatherCode': '116', 'weatherIconUrl': [{'value': 'http://cdn.worldweatheronline.net/images/wsymbols01_png_64/wsymbol_0002_sunny_intervals.png'}], 'weatherDesc': [{'value': 'Partly cloudy'}], 'precipMM': '0.0', 'humidity': '57', 'visibility': '10', 'pressure': '1012', 'cloudcover': '13', 'HeatIndexC': '25', 'HeatIndexF': '78', 'DewPointC': '15', 'DewPointF': '59', 'WindChillC': '24', 'WindChillF': '75', 'WindGustMiles': '12', 'WindGustKmph': '19', 'FeelsLikeC': '25', 'FeelsLikeF': '78', 'uvIndex': '0'}]}]}}
def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out
flat = flatten_json(data['data'])

results = pd.DataFrame()
special_cols = []
columns_list = list(flat.keys())
for item in columns_list:
    try:
        row_idx = re.findall(r'\_(\d+)\_', item)[0]
    except IndexError:
        # no row index in the key: treat it as a column shared by all rows
        special_cols.append(item)
        continue
    column = re.findall(r'\_\d+\_(.*)', item)[0]
    column = column.replace('_', '')
    row_idx = int(row_idx)
    value = flat[item]
    results.loc[row_idx, column] = value
for item in special_cols:
    results[item] = flat[item]

results.to_csv('path/filename.csv', index=False)
Output:
print (results.to_string())
type query date astronomy0sunrise astronomy0sunset astronomy0moonrise astronomy0moonset astronomy0moonphase astronomy0moonillumination maxtempC maxtempF mintempC mintempF totalSnowcm sunHour uvIndex hourly0time hourly0tempC hourly0tempF hourly0windspeedMiles hourly0windspeedKmph hourly0winddirDegree hourly0winddir16Point hourly0weatherCode hourly0weatherIconUrl0value hourly0weatherDesc0value hourly0precipMM hourly0humidity hourly0visibility hourly0pressure hourly0cloudcover hourly0HeatIndexC hourly0HeatIndexF hourly0DewPointC hourly0DewPointF hourly0WindChillC hourly0WindChillF hourly0WindGustMiles hourly0WindGustKmph hourly0FeelsLikeC hourly0FeelsLikeF hourly0uvIndex
0 City Karachi, Pakistan 2019-03-10 06:46 AM 06:38 PM 09:04 AM 09:53 PM Waxing Crescent 24 27 80 22 72 0.0 11.6 7 24 27 80 10 16 234 SW 116 http://cdn.worldweatheronline.net/images/wsymb... Partly cloudy 0.0 57 10 1012 13 25 78 15 59 24 75 12 19 25 78 0
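If the nesting is regular, a shorter route is pandas.json_normalize: a sketch assuming the response always has the sample's shape (one list under 'weather', one under 'hourly'). Note that list-valued fields like weatherIconUrl stay as raw lists unless you normalize them separately:
# one row per hourly record, with a few top-level fields carried along as metadata
hourly = pd.json_normalize(
    data['data']['weather'],
    record_path='hourly',
    meta=['date', 'maxtempC', 'mintempC', 'totalSnow_cm', 'sunHour'],
)
hourly.to_csv('weather_hourly.csv', index=False)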

Group By with sumproduct

I am working with a df with the following structure:
df = DataFrame({'Date': ['1', '1', '1', '1'],
                'Ref': ['one', 'one', 'two', 'two'],
                'Price': ['50', '65', '30', '35'],
                'MktPrice': ['63', '63', '32', '32'],
                'Quantity': ['10', '15', '20', '10'],
                'MarketQuantity': ['50', '50', '100', '100'],
                'Weightings': ['2', '2', '4', '4'],
                'QxWeightings': ['20', '30', '80', '40'],
                'MktQxWeightings': ['100', '100', '400', '400'],
                })
I have managed to get the weighted percentage that my Quantity represents out of MarketQuantity when Price is above MktPrice (shown by Date and Ref):
def percentage(x):
    return (x.loc[x['Price'] >= x['MktPrice'], ['QxWeightings']].sum()
            / (x['MktQxWeightings'].sum() / len(x)))

df.groupby(['Date', 'Ref']).apply(percentage)
Date Ref Output
1 one 0.3
1 two 0.1
However, when I try to group only by Date I get:
Date Output
1    0.4
which is the sum of the previous outputs, when it should be 0.14, i.e. (30+40)/(100+400).
How can I do that with groupby?
IIUC, maybe something like this:
def percentage(x):
    return (x.loc[x['Price'] >= x['MktPrice'], ['QxWeightings']].sum()
            / (x['MktQxWeightings'].sum() / len(x)))

df_new = df.groupby(['Date', 'Ref', 'MktQxWeightings']).apply(percentage).reset_index()
print(df_new)
   Date  Ref  MktQxWeightings  QxWeightings
0     1  one              100           0.3
1     1  two              400           0.1
df_new.groupby('Date')[['MktQxWeightings', 'QxWeightings']].apply(
    lambda x: x['QxWeightings'].cumsum().sum() * 100 / x['MktQxWeightings'].sum())
Date
1 0.14
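A more direct sketch of the same Date-level calculation, assuming numeric-convertible columns and that MktQxWeightings is constant within each (Date, Ref) pair:
num_df = df.astype({'Price': float, 'MktPrice': float,
                    'QxWeightings': float, 'MktQxWeightings': float})
# numerator: weighted quantity where the price beats the market price
num = num_df.loc[num_df['Price'] >= num_df['MktPrice']].groupby('Date')['QxWeightings'].sum()
# denominator: each Ref's market weighting counted once per Date
den = num_df.drop_duplicates(['Date', 'Ref']).groupby('Date')['MktQxWeightings'].sum()
print(num / den)  # Date 1 -> 0.14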
