Related
I have a dictionary like this:
my_dict = {'RuleSet': {'0': {'RuleSetID': '0',
'RuleSetName': 'Allgemein',
'Rules': [{'RulesID': '10',
'RuleName': 'Gemeinde Seiten',
'GroupHits': '2',
'KeyWordGroups': ['100', '101', '102']}]},
'1': {'RuleSetID': '1',
'RuleSetName': 'Portale Berlin',
'Rules': [{'RulesID': '11',
'RuleName': 'Portale Berlin',
'GroupHits': '4',
'KeyWordGroups': ['100', '101', '102', '107']}]},
'6': {'RuleSetID': '6',
'RuleSetName': 'Zwangsvollstr. Berlin',
'Rules': [{'RulesID': '23',
'RuleName': 'Zwangsvollstr. Berlin',
'GroupHits': '1',
'KeyWordGroups': ['100', '101']}]}}}
When using this code snippet it can be transformed into a dataframe:
rules_pd = pd.DataFrame(my_dict['RuleSet'])
rules_pd
The result is:
I would like to make it look like this:
Does anyone know how to tackle this challenge?
Doing from_dict with index
out = pd.DataFrame.from_dict(my_dict['RuleSet'],'index')
Out[692]:
RuleSetID ... Rules
0 0 ... [{'RulesID': '10', 'RuleName': 'Gemeinde Seite...
1 1 ... [{'RulesID': '11', 'RuleName': 'Portale Berlin...
6 6 ... [{'RulesID': '23', 'RuleName': 'Zwangsvollstr....
[3 rows x 3 columns]
#out.columns
#Out[693]: Index(['RuleSetID', 'RuleSetName', 'Rules'], dtype='object')
You could try use Transpose()
rules_pd = pd.DataFrame(my_dict['RuleSet']).transpose()
print(rules_pd)
I am using a weather API that responses with a JSON file. Here is a sample of the returned readings:
{
'data': {
'request': [{
'type': 'City',
'query': 'Karachi, Pakistan'
}],
'weather': [{
'date': '2019-03-10',
'astronomy': [{
'sunrise': '06:46 AM',
'sunset': '06:38 PM',
'moonrise': '09:04 AM',
'moonset': '09:53 PM',
'moon_phase': 'Waxing Crescent',
'moon_illumination': '24'
}],
'maxtempC': '27',
'maxtempF': '80',
'mintempC': '22',
'mintempF': '72',
'totalSnow_cm': '0.0',
'sunHour': '11.6',
'uvIndex': '7',
'hourly': [{
'time': '24',
'tempC': '27',
'tempF': '80',
'windspeedMiles': '10',
'windspeedKmph': '16',
'winddirDegree': '234',
'winddir16Point': 'SW',
'weatherCode': '116',
'weatherIconUrl': [{
'value': 'http://cdn.worldweatheronline.net/images/wsymbols01_png_64/wsymbol_0002_sunny_intervals.png'
}],
'weatherDesc': [{
'value': 'Partly cloudy'
}],
'precipMM': '0.0',
'humidity': '57',
'visibility': '10',
'pressure': '1012',
'cloudcover': '13',
'HeatIndexC': '25',
'HeatIndexF': '78',
'DewPointC': '15',
'DewPointF': '59',
'WindChillC': '24',
'WindChillF': '75',
'WindGustMiles': '12',
'WindGustKmph': '19',
'FeelsLikeC': '25',
'FeelsLikeF': '78',
'uvIndex': '0'
}]
}]
}
}
I used the following Python code in my attempt to reading the data stored in JSON file:
import simplejson as json
data_file = open("new.json", "r")
values = json.load(data_file)
But this outputs with an error as follows:
JSONDecodeError: Expecting value: line 1 column 1 (char 0) error
I am also wondering how I can save the result in a structured format in a CSV file using Python.
As stated below by Rami, the simplest way to do this would be to use pandas to either a) use .read_json(), or to use pd.DataFrame.from_dict(). However, the issue with this particular case is that you have a nested dictionary/JSON. What do I mean by nested? Well, if you were to simply put this into a dataframe, you'd have this:
print (df)
request weather
0 {'type': 'City', 'query': 'Karachi, Pakistan'} {'date': '2019-03-10', 'astronomy': [{'sunrise...
Which is fine if that's what you want. However, I am assuming you'd like all the data/instances flattened into a single row.
So you'll need to either use json_normalize to unravel it (which is possible, but you'd need to be certain the json file follows the same format/keys throughout. And you'd still need to pull out each of the dictionaries within the list, within the dictionaries. Other option is use some function to flatten out the nested json. Then from there you can simply write to file:
I choose to flatten it using a function, then construct the dataframe:
import pandas as pd
import json
import re
from pandas.io.json import json_normalize
data = {'data': {'request': [{'type': 'City', 'query': 'Karachi, Pakistan'}], 'weather': [{'date': '2019-03-10', 'astronomy': [{'sunrise': '06:46 AM', 'sunset': '06:38 PM', 'moonrise': '09:04 AM', 'moonset': '09:53 PM', 'moon_phase': 'Waxing Crescent', 'moon_illumination': '24'}], 'maxtempC': '27', 'maxtempF': '80', 'mintempC': '22', 'mintempF': '72', 'totalSnow_cm': '0.0', 'sunHour': '11.6', 'uvIndex': '7', 'hourly': [{'time': '24', 'tempC': '27', 'tempF': '80', 'windspeedMiles': '10', 'windspeedKmph': '16', 'winddirDegree': '234', 'winddir16Point': 'SW', 'weatherCode': '116', 'weatherIconUrl': [{'value': 'http://cdn.worldweatheronline.net/images/wsymbols01_png_64/wsymbol_0002_sunny_intervals.png'}], 'weatherDesc': [{'value': 'Partly cloudy'}], 'precipMM': '0.0', 'humidity': '57', 'visibility': '10', 'pressure': '1012', 'cloudcover': '13', 'HeatIndexC': '25', 'HeatIndexF': '78', 'DewPointC': '15', 'DewPointF': '59', 'WindChillC': '24', 'WindChillF': '75', 'WindGustMiles': '12', 'WindGustKmph': '19', 'FeelsLikeC': '25', 'FeelsLikeF': '78', 'uvIndex': '0'}]}]}}
def flatten_json(y):
    """Flatten a nested dict/list structure into a single-level dict.

    Each key of the result is the underscore-joined path to a leaf value;
    list elements contribute their index to the path, e.g.
    {'a': [{'b': 1}]} -> {'a_0_b': 1}.
    """
    out = {}

    def flatten(x, name=''):
        # Recurse into mappings and sequences; anything else is a leaf.
        if isinstance(x, dict):
            for key in x:
                flatten(x[key], name + key + '_')
        elif isinstance(x, list):
            for i, item in enumerate(x):
                flatten(item, name + str(i) + '_')
        else:
            # Strip the trailing underscore accumulated while descending.
            out[name[:-1]] = x

    flatten(y)
    return out
# Flatten the nested payload, then rebuild a tabular frame: keys that carry
# a row index (like 'weather_0_maxtempC') become per-row cells, keys without
# one apply to every row.
flat = flatten_json(data['data'])

results = pd.DataFrame()
special_cols = []
columns_list = list(flat.keys())

for item in columns_list:
    # One search instead of two findall scans; no match means the key has
    # no embedded row index. (The old bare `except:` silently swallowed
    # every error, not just the missing-match case.)
    match = re.search(r'_(\d+)_(.*)', item)
    if match is None:
        special_cols.append(item)
        continue
    row_idx = int(match.group(1))
    column = match.group(2).replace('_', '')
    results.loc[row_idx, column] = flat[item]

# Index-free columns are broadcast to every row.
for item in special_cols:
    results[item] = flat[item]

results.to_csv('path/filename.csv', index=False)
Output:
print (results.to_string())
type query date astronomy0sunrise astronomy0sunset astronomy0moonrise astronomy0moonset astronomy0moonphase astronomy0moonillumination maxtempC maxtempF mintempC mintempF totalSnowcm sunHour uvIndex hourly0time hourly0tempC hourly0tempF hourly0windspeedMiles hourly0windspeedKmph hourly0winddirDegree hourly0winddir16Point hourly0weatherCode hourly0weatherIconUrl0value hourly0weatherDesc0value hourly0precipMM hourly0humidity hourly0visibility hourly0pressure hourly0cloudcover hourly0HeatIndexC hourly0HeatIndexF hourly0DewPointC hourly0DewPointF hourly0WindChillC hourly0WindChillF hourly0WindGustMiles hourly0WindGustKmph hourly0FeelsLikeC hourly0FeelsLikeF hourly0uvIndex
0 City Karachi, Pakistan 2019-03-10 06:46 AM 06:38 PM 09:04 AM 09:53 PM Waxing Crescent 24 27 80 22 72 0.0 11.6 7 24 27 80 10 16 234 SW 116 http://cdn.worldweatheronline.net/images/wsymb... Partly cloudy 0.0 57 10 1012 13 25 78 15 59 24 75 12 19 25 78 0
I am working with a df with the following structure:
df = DataFrame({'Date' : ['1', '1', '1', '1'],
'Ref' : ['one', 'one', 'two', 'two'],
'Price' : ['50', '65', '30', '35'],
'MktPrice' : ['63', '63', '32', '32'],
'Quantity' : ['10', '15', '20', '10'],
'MarketQuantity': ['50', '50', '100', '100'],
'Weightings' : ['2', '2', '4', '4'],
'QxWeightings' : ['20', '30', '80', '40'],
'MktQxWeightings': ['100', '100', '400', '400'],
})
I have managed to get the weighted percentage that represents my Quantity out of MarketQuantity, when Price is above Mkt Price (and showing it by Date and Ref)
def percentage(x):
    """Share of QxWeightings (rows priced at/above market) over the
    group's average MktQxWeightings."""
    above_market = x['Price'] >= x['MktPrice']
    hit_weight = x.loc[above_market, ['QxWeightings']].sum()
    market_weight = x['MktQxWeightings'].sum() / len(x)
    return hit_weight / market_weight
df.groupby(['Date', 'Ref']).apply(percentage)
Date Ref Output
1 one 0.3
1 two 0.1
However, when I am trying to group it only by Date I get:
Date Output
1 0.4
which is the sum of previous outputs, when it should be 0.14 (30+40)/(100+400).
How can I do that with groupby?
IIUC, may be something like this:
def percentage(x):
    """Ratio of the group's at/above-market QxWeightings sum to its
    mean MktQxWeightings."""
    mask = x['Price'] >= x['MktPrice']
    denominator = x['MktQxWeightings'].sum() / len(x)
    return x.loc[mask, ['QxWeightings']].sum() / denominator
df_new=df.groupby(['Date', 'Ref','MktQxWeightings']).apply(percentage).reset_index()
print(df_new)
Date Ref MktQxWeightings QxWeightings
0 1 one 100 0.3
1 1 two 400 0.1
def _pct_of_market(x):
    # Sum the per-(Date, Ref) outputs and rescale by the total market weight.
    return x['QxWeightings'].cumsum().sum() * 100 / x['MktQxWeightings'].sum()

df_new.groupby('Date')['MktQxWeightings', 'QxWeightings'].apply(_pct_of_market)
Date
1 0.14
Say I have a dataset with a variable, lines, that looks like this:
lines = ['QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=', '1', '10', '38', '0.0', '9', '20050407', '20170319', '0', '0', '0', '0', '1', '1', '281.6']
['QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=', '1', '10', '38', '0.0', '9', '20050407', '20170319', '0', '0', '0', '0', '1', '1', '281.6']
['QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=', '1', '10', '38', '0.0', '9', '20050407', '20170319', '0', '0', '0', '0', '1', '1', '281.6']
['QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=', '1', '10', '38', '0.0', '9', '20050407', '20170319', '0', '0', '0', '0', '1', '1', '281.6']
How do I — if and only if the first element of two lists is exactly the same — average specific values in the rest of the lists, and combine that into one, averaged list? Of course, I will have to convert all numbers into floats.
In the specific example, I want a singular list, where all the numeric values besides lines[1] and lines[-1] are averaged. Any easy way?
Expected output
['QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=', 1, avg_of_var, avg_of_var, avg, , '20050407', '20170319', '0', '0', '0', '0', '1', '1', '281.6']
Basically - and I see now that my example data is unfortunate as all values are the same - but I want a singular list containing an average of the numeric values of the four lines in the example.
You can use pandas to create a dataframe. You can then group by lines[0] and then aggregate by mean (for desired columns only). However, you also need to specify aggregation method for other columns as well. I will assume, you also need the mean for these columns.
import pandas as pd
from numpy import mean
# All four rows carry the same key and the same values, so grouping on the
# key will collapse them into a single averaged row.
# I have removed the quotes around numbers for simplification, but this can
# also be handled by pandas.
_row = ['QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=', 1, 10, 38, 0.0, 9,
        20050407, 20170319, 0, 0, 0, 0, 1, 1, 281.6]
lines = [list(_row) for _ in range(4)]

# Create a data frame and give names to your fields.
# Here 'KEY' is the name of the first field, which we will use for grouping.
df = pd.DataFrame(lines, columns=['KEY', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
                                  'h', 'i', 'j', 'k', 'l', 'm', 'n'])
This yields something like this:
KEY a b c d e f g h i j k l m n
0 QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ= 1 10 38 0.0 9 20050407 20170319 0 0 0 0 1 1 281.6
1 QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ= 1 10 38 0.0 9 20050407 20170319 0 0 0 0 1 1 281.6
2 QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ= 1 10 38 0.0 9 20050407 20170319 0 0 0 0 1 1 281.6
3 QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ= 1 10 38 0.0 9 20050407 20170319 0 0 0 0 1 1 281.6
This is the operation you are looking for:
data = df.groupby('KEY',as_index=False).aggregate(mean)
This yields:
KEY a b c d e f g h i j k l m n
0 QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ= 1 10 38 0.0 9 20050407 20170319 0 0 0 0 1 1 281.6
You can specify the aggregation type by field by using a dictionary (assuming 'mean' for every field):
data = df.groupby('KEY',as_index=False).aggregate({'a':mean,'b':mean,'c':mean,'d':mean,'e':mean,'f':mean,'g':mean,'h':mean,'i':mean,'j':mean,'k':mean,'l':mean,'m':mean,'n':mean})
More information about groupby can be found here: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.groupby.DataFrameGroupBy.agg.html
Will this simple Python snippet work?
# Assuming `lines` is a list of rows; rows that share the same first element
# (the key) are summed field-by-field and then averaged.
lines = [['QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=', '1', '10', '38', '0.0', '9', '20050407', '20170319', '0', '0', '0', '0', '1', '1', '281.6'],
         ['QA7uiXy8vIbUSPOkCf9RwQ3FsT8jq2OxDr8zqa7bRQ=', '1', '10', '38', '0.0', '9', '20050407', '20170319', '0', '0', '0', '0', '1', '1', '281.6'],
         ['QA7uiXy8vIbUSPOkCf9RwQ3FsT8jq2OxDr8zqa7bRQ=', '1', '10', '38', '0.0', '9', '20050407', '20170319', '0', '0', '0', '0', '1', '1', '281.6'],
         ['QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=', '1', '10', '38', '0.0', '9', '20050407', '20170319', '0', '0', '0', '0', '1', '1', '281.6']]

# Use a dict keyed on line[0]: accumulate running sums plus a count per key,
# then divide by the count at the end to get averages.
average = {}
for line in lines:
    key = line[0]
    if key not in average:
        # First sighting: convert the numeric fields to float up front so
        # every later accumulation and the final division is float math.
        # (The original kept the raw strings here, which made the final
        # division crash for keys seen only once.)
        average[key] = {'data': [key] + [float(v) for v in line[1:]],
                        'qty': 1}
        continue
    # Add this row's fields, after type conversion to float.
    for i in range(1, len(line)):
        average[key]['data'][i] += float(line[i])
    average[key]['qty'] += 1

# Build the merged list: the key followed by each accumulated value divided
# by the number of rows that shared that key.
merged_lines = []
for key, entry in average.items():
    merged = [key]
    for total in entry['data'][1:]:
        merged.append(total / entry['qty'])
    merged_lines.append(merged)
print(merged_lines)
I'm unsuccessfully trying to convert list of lists to a custom dictionary.
I've created the following output saved in two lists:
headers = ['CPU', 'name', 'id', 'cused', 'callc', 'mused', 'mallc']
result = [['1/0', 'aaa', '10', '0.1', '15', '10.73', '16.00'],
['1/0', 'bbb', '10', '0.1', '20', '11.27', '14.00'],
['1/0', 'ccc', '10', '0.2', '10', '11.50', '15.00'],
['1/0', 'aaa', '10', '1.1', '15', '15.10', '23.00']]
Formatted output:
headers:
slot name id cused callc mused mallc
result:
1/0 aaa 10 0.1 15 10.73 16.00
2/0 bbb 25 0.1 20 11.39 14.00
1/0 ccc 10 0.2 10 11.50 15.00
1/0 aaa 10 1.1 15 15.10 23.00
The first n columns (3 in this case) should be used to concatenate key name with all of the remaining columns as output values.
I would like to convert it to a dictionary in the following format:
slot.<slot>.name.<name>.id.<id>.cused:<value>,
slot.<slot>.name.<name>.id.<id>.callc:<value>,
slot.<slot>.name.<name>.id.<id>.mused:<value>,
slot.<slot>.name.<name>.id.<id>.mallc:<value>,
...
for example:
dictionary = {
'slot.1/0.name.aaa.id.10.cused':10,
'slot.1/0.name.aaa.id.25.callc':15,
'slot.1/0.name.aaa.id.10.mused':10.73,
'slot.1/0.name.aaa.id.10.mallc':16.00,
'slot.2/0.name.bbb.id.10.cused':0.1,
...
'slot.<n>.name.<name>.id.<id>.<value_name> <value>
}
Can you show me how that can be done?
Updated - OP added raw lists
Now that you have updated the question to show the raw list it's even easier:
headers = ['CPU', 'name', 'id', 'cused', 'callc', 'mused', 'mallc']
result = [['1/0', 'aaa', '10', '0.1', '15', '10.73', '16.00'],
          ['1/0', 'bbb', '10', '0.1', '20', '11.27', '14.00'],
          ['1/0', 'ccc', '10', '0.2', '10', '11.50', '15.00'],
          ['1/0', 'aaa', '10', '1.1', '15', '15.10', '23.00']]

# The first three columns form the key prefix; each remaining column
# becomes one dictionary entry named after the matching header.
results = {}
for row in result:
    slot, name, _id = row[:3]
    prefix = '.'.join(['slot', slot, 'name', name, 'id', _id])
    for stat, value in zip(headers[3:], row[3:]):
        results[prefix + '.' + stat] = value
>>> from pprint import pprint
>>> pprint(results)
{'slot.1/0.name.aaa.id.10.callc': '15',
'slot.1/0.name.aaa.id.10.cused': '1.1',
'slot.1/0.name.aaa.id.10.mallc': '23.00',
'slot.1/0.name.aaa.id.10.mused': '15.10',
'slot.1/0.name.bbb.id.10.callc': '20',
'slot.1/0.name.bbb.id.10.cused': '0.1',
'slot.1/0.name.bbb.id.10.mallc': '14.00',
'slot.1/0.name.bbb.id.10.mused': '11.27',
'slot.1/0.name.ccc.id.10.callc': '10',
'slot.1/0.name.ccc.id.10.cused': '0.2',
'slot.1/0.name.ccc.id.10.mallc': '15.00',
'slot.1/0.name.ccc.id.10.mused': '11.50'}
Original file based answer
The following code will construct the required dictionary (results). The idea is that each non-header line in the file is split by whitespace into fields, and the fields are used in a dictionary comprehension to construct a dictionary for each line, which is then used to update the results dictionary.
with open('data') as f:
    # Discard the three header lines before the data rows.
    for _ in range(3):
        next(f)

    STAT_NAMES = 'cused callc mused mallc'.split()
    results = {}
    for raw in f:
        # Split each row into whitespace-separated fields; the first three
        # identify the record, the rest are its stat values.
        fields = raw.split()
        slot, name, _id = fields[:3]
        key_prefix = 'slot.{}.name.{}.id.{}'.format(slot, name, _id)
        for stat, value in zip(STAT_NAMES, fields[3:]):
            results['{}.{}'.format(key_prefix, stat)] = value
Output
>>> from pprint import pprint
>>> pprint(results)
{'slot.1/0.name.aaa.id.10.callc': '15',
'slot.1/0.name.aaa.id.10.cused': '1.1',
'slot.1/0.name.aaa.id.10.mallc': '23.00',
'slot.1/0.name.aaa.id.10.mused': '15.10',
'slot.1/0.name.ccc.id.10.callc': '10',
'slot.1/0.name.ccc.id.10.cused': '0.2',
'slot.1/0.name.ccc.id.10.mallc': '15.00',
'slot.1/0.name.ccc.id.10.mused': '11.50',
'slot.2/0.name.bbb.id.25.callc': '20',
'slot.2/0.name.bbb.id.25.cused': '0.1',
'slot.2/0.name.bbb.id.25.mallc': '14.00',
'slot.2/0.name.bbb.id.25.mused': '11.39'}
Try this. Note: I renamed the first header from "CPU" to "slot".
headers = ['slot', 'name', 'id', 'cused', 'callc', 'mused', 'mallc']
result = [['1/0', 'aaa', '10', '0.1', '15', '10.73', '16.00'],
          ['1/0', 'bbb', '10', '0.1', '20', '11.27', '14.00'],
          ['1/0', 'ccc', '10', '0.2', '10', '11.50', '15.00'],
          ['1/0', 'aaa', '10', '1.1', '15', '15.10', '23.00']]

# Column-wise view of the table, e.g. [['1/0', '1/0', '1/0', '1/0'],
# ['aaa', 'bbb', 'ccc', 'aaa'], ...]
transpose_result = map(list, zip(*result))
# Header -> column values, e.g. {'slot': ['1/0', '1/0', '1/0', '1/0'],
# 'mallc': ['16.00', '14.00', '15.00', '23.00'], ...}
data = dict(zip(headers, transpose_result))

d = {}
for stat in ("cused", "callc", "mused", "mallc"):
    for idx, val in enumerate(data[stat]):
        # Interleave identifier names with that row's values, then append
        # the stat name: slot.<slot>.name.<name>.id.<id>.<stat>
        parts = []
        for id_field in ("slot", "name", "id"):
            parts.extend((id_field, data[id_field][idx]))
        parts.append(stat)
        d[".".join(parts)] = val
You get the following in d:
{
'slot.1/0.name.bbb.id.10.cused': '0.1',
'slot.1/0.name.aaa.id.10.cused': '1.1',
'slot.1/0.name.bbb.id.10.callc': '20',
'slot.1/0.name.aaa.id.10.mallc': '23.00',
'slot.1/0.name.aaa.id.10.callc': '15',
'slot.1/0.name.ccc.id.10.mallc': '15.00',
'slot.1/0.name.ccc.id.10.mused': '11.50',
'slot.1/0.name.aaa.id.10.mused': '15.10',
'slot.1/0.name.ccc.id.10.cused': '0.2',
'slot.1/0.name.ccc.id.10.callc': '10',
'slot.1/0.name.bbb.id.10.mallc': '14.00',
'slot.1/0.name.bbb.id.10.mused': '11.27'}
import itertools

headers = 'slot name id cused callc mused mallc'.split()
result = ['1/0 aaa 10 0.1 15 10.73 16.00'.split(),
          '2/0 bbb 25 0.1 20 11.39 14.00'.split()]

key_len = 3
d = {}
for row in result:
    # Interleave header names with the first key_len values:
    # ('slot', '1/0', 'name', 'aaa', 'id', '10') -> 'slot.1/0.name.aaa.id.10'
    interleaved = itertools.chain.from_iterable(zip(headers, row[:key_len]))
    prefix = '.'.join(interleaved)
    for stat, val in zip(headers[key_len:], row[key_len:]):
        d['{}.{}'.format(prefix, stat)] = val
Another solution, with the correct numeric types for cused, callc, mused and mallc:
# Builds the same dictionary, but with proper numeric types for the stats:
# callc is stored as int, cused/mused/mallc as float.
labels = ['slot', 'name', 'id', 'cused', 'callc', 'mused', 'mallc']
data = ['1/0 aaa 10 0.1 15 10.73 16.00',
        '2/0 bbb 25 0.1 20 11.39 14.00',
        '1/0 ccc 10 0.2 10 11.50 15.00',
        '1/0 aaa 10 1.1 15 15.10 23.00']

data = [tuple(e.split()) for e in data]
# list() is required on Python 3: zip returns a lazy iterator, and the
# original code's e[i] indexing would raise TypeError.
data = [list(zip(labels, e)) for e in data]

results = dict()
for e in data:
    # Key prefix from the first three (label, value) pairs:
    # slot.<slot>.name.<name>.id.<id>
    s = '%s.%s.%s' % tuple('.'.join(e[i]) for i in range(3))
    for i in range(3, 7):
        label, value = e[i]
        # Index 4 is 'callc' (a count) -> int; the rest -> float.
        results['%s.%s' % (s, label)] = int(value) if i == 4 else float(value)
print(results)