Convert graphlab sframe into a dictionary of {key: values} - python

Given an SFrame as such:
+------+-----------+-----------+-----------+-----------+-----------+-----------+
| X1 | X2 | X3 | X4 | X5 | X6 | X7 |
+------+-----------+-----------+-----------+-----------+-----------+-----------+
| the | -0.060292 | 0.06763 | -0.036891 | 0.066684 | 0.024045 | 0.099091 |
| , | 0.026625 | 0.073101 | -0.027073 | -0.019504 | 0.04173 | 0.038811 |
| . | -0.005893 | 0.093791 | 0.015333 | 0.046226 | 0.032791 | 0.110069 |
| of | -0.050371 | 0.031452 | 0.04091 | 0.033255 | -0.009195 | 0.061086 |
| and | 0.005456 | 0.063237 | -0.075793 | -0.000819 | 0.003407 | 0.053554 |
| to | 0.01347 | 0.043712 | -0.087122 | 0.015258 | 0.08834 | 0.139644 |
| in | -0.019466 | 0.077509 | -0.102543 | 0.034337 | 0.130886 | 0.032195 |
| a | -0.072288 | -0.017494 | -0.018383 | 0.001857 | -0.04645 | 0.133424 |
| is | 0.052726 | 0.041903 | 0.163781 | 0.006887 | -0.07533 | 0.108394 |
| for | -0.004082 | -0.024244 | 0.042166 | 0.007032 | -0.081243 | 0.026162 |
| on | -0.023709 | -0.038306 | -0.16072 | -0.171599 | 0.150983 | 0.042044 |
| that | 0.062037 | 0.100348 | -0.059753 | -0.041444 | 0.041156 | 0.166704 |
| ) | 0.052312 | 0.072473 | -0.02067 | -0.015581 | 0.063368 | -0.017216 |
| ( | 0.051408 | 0.186162 | 0.03028 | -0.048425 | 0.051376 | 0.004989 |
| with | 0.091825 | -0.081649 | -0.087926 | -0.061273 | 0.043528 | 0.107864 |
| was | 0.046042 | -0.058529 | 0.040581 | 0.067748 | 0.053724 | 0.041067 |
| as | 0.025248 | -0.012519 | -0.054685 | -0.040581 | 0.051061 | 0.114956 |
| it | 0.028606 | 0.106391 | 0.025065 | 0.023486 | 0.011184 | 0.016715 |
| by | -0.096704 | 0.150165 | -0.01775 | -0.07178 | 0.004458 | 0.098807 |
| be | -0.109489 | -0.025908 | 0.025608 | 0.076263 | -0.047246 | 0.100489 |
+------+-----------+-----------+-----------+-----------+-----------+-----------+
How can I convert the SFrame into a dictionary such that X1 column is the key and X2 to X7 as the np.array()?
I have tried iterating through the original SFrame row-by-row and do something like this:
>>> import graphlab as gl
>>> import numpy as np
>>> x = gl.SFrame()
>>> a = np.array([1,2,3])
>>> w = 'foo'
>>> x.append(gl.SFrame({'word':[w], 'vector':[a]}))
Columns:
vector array
word str
Rows: 1
Data:
+-----------------+------+
| vector | word |
+-----------------+------+
| [1.0, 2.0, 3.0] | foo |
+-----------------+------+
[1 rows x 2 columns]
Is there another way to do the same?
EDITED
After trying #papayawarrior solution, it works if I can load the whole dataframe into memory but there's a few quriks that makes it odd.
Assuming that my original input to the SFrame is as presented above (with 501 columns) but in .csv file, I have the code to read them into the desired dictionary:
def get_embeddings(embedding_gzip, size):
coltypes = [str] + [float] * size
sf = gl.SFrame.read_csv('compose-vectors/' + embedding_gzip, delimiter='\t', column_type_hints=coltypes, header=False, quote_char='\0')
sf = sf.pack_columns(['X'+str(i) for i in range(2, size+1)])
df = sf.to_dataframe().set_index('X1')
print list(df)
return df.to_dict(orient='dict')['X2']
But oddly it gives this error:
File "sts_compose.py", line 28, in get_embeddings
return df.to_dict(orient='dict')['X2']
KeyError: 'X2'
So when I check for the column names before the conversion to dictionary, I found that my column names are not 'X1' and 'X2' but list(df) prints ['X501', 'X3'].
Is there something wrong with how I have converting the graphlab.SFrame -> pandas.DataFrame -> dict?
I know I can resolve the problem by doing this instead, but the question remains, "How did the column names become so strange?":
def get_embeddings(embedding_gzip, size):
coltypes = [str] + [float] * size
sf = gl.SFrame.read_csv('compose-vectors/' + embedding_gzip, delimiter='\t', column_type_hints=coltypes, header=False, quote_char='\0')
sf = sf.pack_columns(['X'+str(i) for i in range(2, size+1)])
df = sf.to_dataframe().set_index('X1')
col_names = list(df)
return df.to_dict(orient='dict')[col_names[1]]

Is there another way to do the same?
Yes, you can use the pack_columns method from the SFrame class.
import graphlab as gl
data = gl.SFrame()
data.add_column(gl.SArray(['foo', 'bar']), 'X1')
data.add_column(gl.SArray([1., 3.]), 'X2')
data.add_column(gl.SArray([2., 4.]), 'X3')
print data
+-----+-----+-----+
| X1 | X2 | X3 |
+-----+-----+-----+
| foo | 1.0 | 2.0 |
| bar | 3.0 | 4.0 |
+-----+-----+-----+
[2 rows x 3 columns]
import array
data = data.pack_columns(['X2', 'X3'], dtype=array.array, new_column_name='vector')
data = data.rename({'X1':'word'})
print data
+------+------------+
| word | vector |
+------+------------+
| foo | [1.0, 2.0] |
| bar | [3.0, 4.0] |
+------+------------+
[2 rows x 2 columns]
b=data['vector'][0]
print type(b)
<type 'array.array'>
How can I convert the SFrame into a dictionary such that X1 column is the key and X2 to X7 as the np.array()?
I didn't find any built-in method to convert an SFrame to a dict. You could try the following (it might be very slow):
a={}
def dump_sframe_to_dict(row, a):
a[row['word']]=row['vector']
data.apply(lambda x: dump_sframe_to_dict(x, a))
print a
{'foo': array('d', [1.0, 2.0]), 'bar': array('d', [3.0, 4.0])}

Edited to match new questions in the post.
#Adrien Renaud is spot on with the SFrame.pack_columns method, but I would suggest using the Pandas dataframe to_dict for the last question if your dataset fits in memory.
>>> import graphlab as gl
>>> sf = gl.SFrame({'X1': ['cat', 'dog'], 'X2': [1, 2], 'X3': [3, 4]})
>>> sf
+-----+----+----+
| X1 | X2 | X3 |
+-----+----+----+
| cat | 1 | 3 |
| dog | 2 | 4 |
+-----+----+----+
>>> sf2 = sf.rename({'X1': 'word'})
>>> sf2 = sf.pack_columns(column_prefix='X', new_column_name='vector')
>>> sf2
+------+--------+
| word | vector |
+------+--------+
| cat | [1, 3] |
| dog | [2, 4] |
+------+--------+
>>> df = sf2.to_dataframe().set_index('word')
>>> result = df.to_dict(orient='dict')['vector']
>>> result
{'cat': [1, 3], 'dog': [2, 4]}

Related

Looking for a solution to add numeric and float elements stored in list format in one of the columns in dataframe

| Index | col1 |
| -------- | -------------- |
| 0 | [0,0] |
| 2 | [7.9, 11.06] |
| 3 | [0.9, 4] |
| 4 | NAN |
I have data similar to like this.I want to add elements of the list and store it in other column say total using loop such that output looks like this:
| Index | col1 |Total |
| -------- | -------------- | --------|
| 0 | [0,0] |0 |
| 2 | [7.9, 11.06] |18.9 |
| 3 | [0.9, 4] |4.9 |
| 4 | NAN |NAN |
Using na_action parameter in map should work as well:
df['Total'] = df['col1'].map(sum,na_action='ignore')
Use apply with a lambda to sum the lists or return np.NA if the values are not a list:
df['Total'] = df['col1'].apply(lambda x: sum(x) if isinstance(x, list) else pd.NA)
I tried with df.fillna([]), but lists are not a valid parameters of fillna.
Edit: consider using awkward arrays instead of lists: https://awkward-array.readthedocs.io/en/latest/

Creating a groupby column based on ids and category

I have a table below where i need to create a column of "Relevant" and "Non-Relevant" based on IDs.
The table looks like something below:
+----+--------------+--------+
| ID | Experience | Length |
+----+--------------+--------+
| 1 | Relevant | 2 |
| 1 | Non-Relevant | 1 |
| 4 | Relevant | 3 |
| 4 | Relevant | 4 |
| 4 | Non-Relevant | 0 |
| 5 | Relevant | 1 |
| 5 | Relevant | 1 |
+----+--------------+--------+
This is the output I am trying to get
+----+----------+--------------+
| ID | Relevant | Non-Relevant |
+----+----------+--------------+
| 1 | 2 | 1 |
| 4 | 7 | 0 |
| 5 | 2 | 0 |
+----+----------+--------------+
import pandas as pd
df = pd.DataFrame({'id': [1, 1, 4, 4, 4, 5, 5], 'exp': [x for x in 'rnrrnrr'], 'len':[2, 1, 3, 4, 0, 1, 1]})
pd.pivot_table(df, index='id', values='len', columns='exp', aggfunc='sum', fill_value=0)
Documentation: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.pivot_table.html
To create the dataframe:
ID = [1,1,4,4,4,5,5]
Experience = ['Relevant', 'Non-Relevant', 'Relevant', 'Relevant', 'Non-Relevant',
'Relevant', 'Relevant']
length = [2,1,3,4,0,1,1]
dictionary = {'ID' : ID,
'Experience' : Experience,
'Length' : length}
To group it and then unstack:
df.groupby(by=['ID','Experience']).sum().unstack()['Length'].fillna(0)

Printing Lists as Tabular Data, Group Rows

I need to format a data containing as list of lists in a table.
I can make a grid using tabulate:
x = [['Alice', 'min', 2],
['', 'max', 5],
['Bob', 'min', 8],
['', 'max', 15]]
header = ['Name', '', 'value']
print(tabulate.tabulate(x, headers=header, tablefmt="grid"))
+--------+-----+---------+
| Name | | value |
+========+=====+=========+
| Alice | min | 2 |
+--------+-----+---------+
| | max | 5 |
+--------+-----+---------+
| Bob | min | 8 |
+--------+-----+---------+
| | max | 15 |
+--------+-----+---------+
However, we require grouping of rows, like this:
+--------+-----+---------+
| Name | | value |
+========+=====+=========+
| Alice | min | 2 |
+ + + +
| | max | 5 |
+--------+-----+---------+
| Bob | min | 8 |
+ + + +
| | max | 15 |
+--------+-----+---------+
I tried using multiline rows (using "\n".join()), which is apparently supported in tabular 0.8.3, with no success.
This is required to run in the production server, so we can't use any heavy libraries. We are using tabulate because the whole tabulate library is a single file, and we can ship the file with the product.
You can try this:
x = [['Alice', 'min\nmax', '2\n5'],
['Bob', 'min\nmax', '8\n15'],
]
+--------+-----+------------------------+
| Name | | ['value1', 'value2'] |
+========+=====+========================+
| Alice | min | 2 |
| | max | 5 |
+--------+-----+------------------------+
| Bob | min | 8 |
| | max | 15 |
+--------+-----+------------------------+

Python Spark implementing map-reduce algorithm to create (column, value) tuples

UPDATE(04/20/17):
I am using Apache Spark 2.1.0 and I will be using Python.
I have narrowed down the problem and hopefully someone more knowledgeable with Spark can answer. I need to create an RDD of tuples from the header of the values.csv file:
values.csv (main collected data, very large):
+--------+---+---+---+---+---+----+
| ID | 1 | 2 | 3 | 4 | 9 | 11 |
+--------+---+---+---+---+---+----+
| | | | | | | |
| abc123 | 1 | 2 | 3 | 1 | 0 | 1 |
| | | | | | | |
| aewe23 | 4 | 5 | 6 | 1 | 0 | 2 |
| | | | | | | |
| ad2123 | 7 | 8 | 9 | 1 | 0 | 3 |
+--------+---+---+---+---+---+----+
output (RDD):
+----------+----------+----------+----------+----------+----------+----------+
| abc123 | (1;1) | (2;2) | (3;3) | (4;1) | (9;0) | (11;1) |
| | | | | | | |
| aewe23 | (1;4) | (2;5) | (3;6) | (4;1) | (9;0) | (11;2) |
| | | | | | | |
| ad2123 | (1;7) | (2;8) | (3;9) | (4;1) | (9;0) | (11;3) |
+----------+----------+----------+----------+----------+----------+----------+
What happened was I paired each value with the column name of that value in the format:
(column_number, value)
raw format (if you are interested in working with it):
id,1,2,3,4,9,11
abc123,1,2,3,1,0,1
aewe23,4,5,6,1,0,2
ad2123,7,8,9,1,0,3
The Problem:
The example values.csv file contains only a few columns, but in the actual file there are thousands of columns. I can extract the header and broadcast it to every node in the distributed environment, but I am not sure if that is the most efficient way to solve the problem. Is it possible to achieve the output with a parallelized header?
I think you can achieve the solution using PySpark Dataframe too. However, my solution is not optimal yet. I use split to get the new column name and corresponding columns to do sum. This depends on how large is your key_list. If it's too large, this might not work will because you have to load key_list on memory (using collect).
import pandas as pd
import pyspark.sql.functions as func
# example data
values = spark.createDataFrame(pd.DataFrame([['abc123', 1, 2, 3, 1, 0, 1],
['aewe23', 4, 5, 6, 1, 0, 2],
['ad2123', 7, 8, 9, 1, 0, 3]],
columns=['id', '1', '2', '3','4','9','11']))
key_list = spark.createDataFrame(pd.DataFrame([['a', '1'],
['b','2;4'],
['c','3;9;11']],
columns=['key','cols']))
# use values = spark.read.csv(path_to_csv, header=True) for your data
key_list_df = key_list.select('key', func.split('cols', ';').alias('col'))
key_list_rdd = key_list_df.rdd.collect()
for row in key_list_rdd:
values = values.withColumn(row.key, sum(values[c] for c in row.col if c in values.columns))
keys = [row.key for row in key_list_rdd]
output_df = values.select(keys)
Output
output_df.show(n=3)
+---+---+---+
| a| b| c|
+---+---+---+
| 1| 3| 4|
| 4| 6| 8|
| 7| 9| 12|
+---+---+---+

How to convert dict to spark map output

I'm working with spark and python. I would like to transform my input dataset.
My input dataset (RDD)
-------------------------------------------------------------
| id | var |
-------------------------------------------------------------
| 1 |"[{index: 1, value: 200}, {index: 2, value: A}, ...]" |
| 2 |"[{index: 1, value: 140}, {index: 2, value: C}, ...]" |
| .. | ... |
-------------------------------------------------------------
I would like to have this DataFrame (output dataset)
----------------------
| id | index | value |
----------------------
| 1 | 1 | 200 |
| 1 | 2 | A |
| 1 | ... | ... |
| 2 | 1 | 140 |
| 2 | 2 | C |
| ...| ... | ... |
----------------------
I create a map function
def process(row):
my_dict = {}
for item in row['value']:
my_dict['id'] = row['id']
my_dict['index'] = item['index']
my_dict['value'] = item['value']
return my_dict
I would like to map my process function like this:
output_rdd = input_rdd.map(process)
Is it possible to do this on this way (or a simpler way)?
I found the solution:
output_rdd = input_rdd.map(lambda row:process(row)).flatMap(lambda x: x)

Categories