I want to compute statistics over a long column. The problem is that the column contains different labels (A, B, C, D, ...) interleaved with a repeated value (2), and I want to count how many times that value follows each label.
Example:
A
2
2
2
2
B
2
2
C
D
E
2
2
Output will be like:
A 4
B 2
C
D
E 2
Check where the Series, s, equals your magic number. Form groups after masking by that same check, but forward filling.
# Flag the rows that hold the counted value, label every row with the most
# recent non-flagged entry (mask + forward fill), then total the flags per label.
u = s == '2' # `2` if it's not a string
group_labels = s.mask(u).ffill()
u.groupby(group_labels).sum()
A 4.0
B 2.0
C 0.0
D 0.0
E 2.0
dtype: float64
Input data:
import pandas as pd
s = pd.Series(list('A2222B22CDE22'))
I am assuming that we are working with a text file. ('test_input.txt')
import pandas as pd
# Read the single-column file; every row is either a label or a number.
data = list(pd.read_csv('test_input.txt', header=None)[0])
final_out = {}
last_item = None
for item in data:
    try:
        item = int(item)
    except ValueError:
        # Not a number: this row is a label that opens a new group.
        item = str(item)
        last_item = item
        final_out[last_item] = 0
    else:
        # A number is counted towards the most recent label. A number that
        # appears before any label raises KeyError (last_item is still None).
        final_out[last_item] += 1
print(final_out)
## {'A': 4, 'B': 2, 'C': 0, 'D': 0, 'E': 2}
# Keep the frame in a variable: the renaming/sorting steps below operate on
# `dataframe`, which was previously never defined.
dataframe = pd.DataFrame.from_dict(final_out, orient='index')
print(dataframe)
## 0
## A 4
## B 2
## C 0
## D 0
## E 2
# Give the count column a name so it can be used as a sort key.
dataframe = dataframe.rename(columns={0:'unique'})
print(dataframe)
# Ordering
dataframe = dataframe.sort_values(by=['unique'])
print(dataframe)
Related
I want to list the leading identifiers 1|, 2|, 3| (the first item of each `;`-separated group) in a separate column (a), and put the sum of all numbers belonging to each group in another column (d).
data structure
1|1=89325|2=96682|3=81940 267947
2|1=17162|2=21282|3=23033; 61477
3|1=71761|2=73375|3=83581; 228717
coding
a = {'TAG': ';1|1=89325|2=96682|3=81940;2|1=17162|2=21282|3=23033;3|1=71761|2=73375|3=83581'}
# Every "<key>=<value>" pair in the tag string becomes one (key, value) tuple of strings.
parsed_data = re.findall(r'([\d.]+)=([\d.]+)', a['TAG'])
# Build the frame directly from the tuples. The previous str()/join()/eval()
# round-trip produced the same two string columns but failed when the pattern
# matched zero times (eval of '') or once (eval returned a single dict).
a = pd.DataFrame(parsed_data, columns=['a', 'b'])
a
code output
a b
0 1 89325
1 2 96682
2 3 81940
3 1 17162
4 2 21282
5 3 23033
6 1 71761
7 2 73375
8 3 83581
expected output
a b c d
0 1 1 89325
1 1 2 96682
2 1 3 81940 267947
3 2 1 17162
4 2 2 21282
5 2 3 23033 61477
6 3 1 71761
7 3 2 73375
8 3 3 83581 228717
Solution which doesn't use regex but it generates the expected output:
import pandas as pd
from itertools import chain
a = {'TAG': '1###;1|1=89325|2=96682|3=81940;2|1=17162|2=21282|3=23033;3|1=71761|2=73375|3=83581'}
list_of_lists = []
# Transform string into list of lists of tuples
for row in a['TAG'].split(';'):
    content = tuple(row.split('|'))
    # Segments without any "key=value" part (the '1###' placeholder, or an
    # empty segment produced by a leading/trailing ';') carry no data.
    if len(content) < 2:
        continue
    # Parse the whole leading field so multi-digit row numbers are kept intact
    # (taking only its first character would turn row 12 into row 1).
    row_num = int(content[0])
    list_of_lists.append([(row_num, *map(int, c.split('='))) for c in content[1:]])
# Calculate the total for each list
totals = [sum(values[-1] for values in t) for t in list_of_lists]
# Append the total to the last tuple in each list
for index, elem in enumerate(list_of_lists):
    elem[-1] = (*elem[-1], totals[index])
# Flatten the list_of_lists structure
data = list(chain.from_iterable(list_of_lists))
# Create the dataframe; rows without a total are shorter and get a missing value in 'd'
df = pd.DataFrame(data, columns=['a', 'b', 'c', 'd'])
UPDATE: Solution where map function is applied to input parsed using regular expression:
import re
from itertools import chain
import pandas as pd
a = {'TAG': ';1|1=89325|2=96682|3=81940;2|1=17162|2=21282|3=23033;3|1=71761|2=73375|3=83581'}
def list_of_tuples(tup):
    """Turn one 7-field regex match into three (row, key, value) tuples.

    Field 0 is the row number; fields 1-6 are three key/value pairs. All
    fields are converted to int, and the sum of the three values is appended
    as a fourth element to the last tuple.
    """
    row_id = int(tup[0])
    triples = []
    for key_pos in (1, 3, 5):
        triples.append((row_id, int(tup[key_pos]), int(tup[key_pos + 1])))
    total = sum(value for _, _, value in triples)
    triples[-1] = triples[-1] + (total,)
    return triples
# One match per ";<row>|<k>=<v>|<k>=<v>|<k>=<v>" group, captured as 7 fields.
regex = r"\;(\d+)\|(\d+)\=(\d+)\|(\d+)\=(\d+)\|(\d+)\=(\d+)"
matches = re.findall(regex, a['TAG'])
# Expand every match into its tuples and flatten them into a single list of rows.
data = list(chain.from_iterable(list_of_tuples(match) for match in matches))
df = pd.DataFrame(data, columns =['a', 'b', 'c', 'd'])
UPDATE 2: A more efficient map function, although the code is more verbose.
def list_of_tuples(tup):
    """Turn one regex match into (row, key, value) tuples for any number of pairs.

    Field 0 is the row number; the remaining fields alternate key, value.
    All fields are converted to int. The sum of all values is appended as a
    fourth element to the last tuple. A trailing key without a value is ignored.
    """
    numbers = [int(field) for field in tup]
    row_id = numbers[0]
    # Odd positions hold the keys, even positions (from 2) hold the values.
    triples = [(row_id, key, value) for key, value in zip(numbers[1::2], numbers[2::2])]
    values_total = sum(value for _, _, value in triples)
    triples[-1] = triples[-1] + (values_total,)
    return triples
I want to transform a matrix into a 3D array, or a 3D array back into a matrix. How do I input the data, and how do I do the transformation in Python?
I have searched in many places but found no answer. Please help me.
matrix a:
a b c
d 1 2 3
e 2 3 4
f 4 3 2
array b:
a d 1
a e 2
a f 4
b d 2
b e 3
b f 3
c d 3
c e 4
c f 2
Can I use stack() to achieve my goal?
like: Python pandas - pd.melt a dataframe with datetime index results in NaN
So your data is not actually 3 dimensional, but 2 dimensional. You are essentially trying to unpivot your 2d data. This is often called melt. Your best option is to load the data into a pandas data frame.
import pandas as pd
df = pd.DataFrame([['d',1,2,3],['e',2,3,4],['f',4,3,2]], columns=['idx','a','b','c'])
df
# returns:
idx a b c
0 d 1 2 3
1 e 2 3 4
2 f 4 3 2
# The identifier column of `df` is named 'idx' (there is no 'index' column).
pd.melt(df, id_vars='idx', value_vars=list('abc'))
# returns:
idx variable value
0 d a 1
1 e a 2
2 f a 4
3 d b 2
4 e b 3
5 f b 3
6 d c 3
7 e c 4
8 f c 2
I'm not very familiar with the pandas library but here is a rough solution using the python standard library:
#!/usr/bin/env python2
"""
Convert a matrix to 2D arrays and vice versa
http://stackoverflow.com/questions/43289673
"""
from collections import OrderedDict
TEST_MATRIX = """\
a b c
d 1 2 3
e 2 3 4
f 4 3 2
"""
def parse_matrix(matrix_string):
    """Parse a matrix string and return list of tuples representing data.

    The first line (after stripping surrounding whitespace) holds the
    whitespace-separated column headers. Every following line holds a row
    header followed by one value per column.

    Returns a list of ``(column_header, row_header, value)`` tuples of
    strings, ordered column by column. An empty or blank string yields ``[]``.
    """
    list_of_lines = matrix_string.strip().splitlines()
    if not list_of_lines:
        return []
    y_headers = list_of_lines[0].split()
    data_rows = [line.split() for line in list_of_lines[1:]]
    parsed_list = []
    # Track the column position directly: looking it up by header name would
    # always hit the first column when two headers share a name.
    # Values start at position 1 because position 0 is the row header.
    for column, y in enumerate(y_headers, 1):
        for row in data_rows:
            parsed_list.append((y, row[0], row[column]))
    return parsed_list
def convert_to_matrix(data):
    """
    Convert a parsed matrix (in the form of a list of tuples) to a matrix
    (string)

    Each entry of *data* is read as ``(column_header, row_header, ..., value)``:
    item 0 is the column, item 1 the row and the last item the cell value.
    Headers appear in first-seen order. When a cell occurs more than once the
    first value wins. Raises IndexError when a column/row combination has no
    value.
    """
    # OrderedDict keys serve as ordered sets: plain sets would lose the
    # first-seen ordering of the headers.
    y_headers = OrderedDict()
    x_headers = OrderedDict()
    cells = {}
    # A single pass collects the headers and indexes each cell by
    # (column, row), so building the rows does not rescan `data` per cell.
    for entry in data:
        y_headers.setdefault(entry[0])
        x_headers.setdefault(entry[1])
        cells.setdefault((entry[0], entry[1]), entry[-1])
    lines = [" " + " ".join(y_headers)]  # header
    for x in x_headers:
        row = [x]
        for y in y_headers:
            if (y, x) not in cells:
                raise IndexError("no value for column %r, row %r" % (y, x))
            row.append(cells[(y, x)])
        lines.append(" ".join(row))
    return "\n".join(lines)
def main():
    """Print TEST_MATRIX, its parsed tuple form, and the matrix rebuilt from it."""
    print("Test matrix:")
    print(TEST_MATRIX)
    # parse the test matrix string to a list of tuples
    parsed_test_matrix = parse_matrix(TEST_MATRIX)
    # print the parsed matrix
    print("Parsed matrix:")
    for row in parsed_test_matrix:
        print(" ".join(row))
    # Single-argument print(...) calls behave the same as a statement on
    # Python 2 and as a function on Python 3; print("") emits the blank line.
    print("")
    # convert parsed matrix back to the original matrix and print
    print("Convert parsed matrix back to matrix:")
    print(convert_to_matrix(parsed_test_matrix))
if __name__ == "__main__":
main()
For example, I would like to get the letters of the rows where a period of at least two consecutive drops in the other column begins.
Exemplary data:
a b
0 3 a
1 2 b
2 3 c
3 2 d
4 1 e
5 0 f
6 -1 g
7 3 h
8 1 i
9 0 j
Exemplary solution with simple loop:
import pandas as pd
df = pd.DataFrame({'a': [3,2,3,2,1,0,-1,3,1,0], 'b': list('abcdefghij')})
l = []
# Slide a three-row window over the frame; `less` counts how many
# consecutive drops in column 'a' end at the current row.
prev_prev_row, prev_row = df.iloc[0], df.iloc[1]
less = 1 if prev_row['a'] < prev_prev_row['a'] else 0
for i, row in df.iloc[2:].iterrows():
    less = less + 1 if row['a'] < prev_row['a'] else 0
    if less == 2:
        # Second drop in a row: the streak started two rows back.
        l.append(prev_prev_row['b'])
    prev_prev_row, prev_row = prev_row, row
This gives list l:
['c', 'h']
Here's one approach with some help from NumPy and Scipy -
import numpy as np
# binary_closing is imported from scipy.ndimage directly; the
# scipy.ndimage.morphology namespace is deprecated.
from scipy.ndimage import binary_closing

arr = df.a.values
# Entry k (1 <= k < len(arr)) is True when arr[k] < arr[k-1]; a False is
# added at each end, so the mask is one element longer than arr.
mask1 = np.hstack((False,arr[1:] < arr[:-1],False))
# NOTE(review): closing the inverted mask with a 2-element structure looks
# intended to drop isolated single drops from mask1 - confirm.
mask2 = mask1 & (~binary_closing(~mask1,[1,1]))
# True where mask2 switches from False to True between neighbouring entries.
final_mask = mask2[1:] > mask2[:-1]
out = list(df.b[final_mask])
use rolling(2) in reverse
# Work on column `a` in reverse row order: diff().gt(0) flags rows whose value
# is greater than the value of the next row (in original order), and
# rolling(2).sum().eq(2) keeps rows where that flag is set twice in a row.
s = df.a[::-1].diff().gt(0).rolling(2).sum().eq(2)
# Select `b` for flagged rows whose shifted neighbour has a different flag.
# NOTE(review): this appears to keep only the first row of each run - confirm.
df.b.loc[s & (s != s.shift(-1))]
2 c
7 h
Name: b, dtype: object
if you actually wanted a list
# Same boolean selection on `b`, converted to a plain Python list.
df.b.loc[s & (s != s.shift(-1))].tolist()
['c', 'h']
I can convert all text features in a pandas dataframe by casting to 'category' using the df.astype() method as below. However I find category hard to work with (eg for plotting data) and would prefer to create a new column of integers
# Add a "<name>_category" companion column for every object-dtype column.
object_types = dataset.select_dtypes(include=['O'])
for col in object_types.columns:
    category_name = '{0}_category'.format(col)
    dataset[category_name] = dataset[col].astype('category')
I can convert the text to integers using this hack:
# Add a "<name>_int_value" companion column for every object-dtype column.
object_types = dataset.select_dtypes(include=['O'])
new_cols = {}
for col in object_types:
    values = dataset[col].tolist()
    # Number the distinct values in set iteration order (arbitrary).
    data_indexed = {item: i for i, item in enumerate(set(values))}
    new_cols[col] = [data_indexed[item] for item in values]
# Attach the new columns only after the loop over the source columns is done.
for key, val in new_cols.items():
    dataset['{0}_int_value'.format(key)] = val
But is there a better (or existing) way to do the same?
I would use factorize method, which is designed for this particular task:
In [90]: x
Out[90]:
A B
9 c z
10 c z
4 b x
5 b y
1 a w
7 b z
In [91]: x.apply(lambda col: pd.factorize(col, sort=True)[0])
Out[91]:
A B
9 2 3
10 2 3
4 1 1
5 1 2
1 0 0
7 1 3
or:
In [92]: x.apply(lambda col: pd.factorize(col)[0])
Out[92]:
A B
9 0 0
10 0 0
4 1 1
5 1 2
1 2 3
7 1 0
consider df
df = pd.DataFrame(dict(A=list('aaaabbbbcccc'),
B=list('wwxxxyyzzzzz')))
df
you can convert to integers like this
def intify(s):
    """Map each value of Series *s* to its position among the sorted unique values."""
    uniques = np.unique(s)
    codes = dict(zip(uniques, np.arange(uniques.size)))
    return s.map(codes)
or shorter version
def intify(s):
    """Map each value of Series *s* to its position among the sorted unique values."""
    codes = {}
    for position, value in enumerate(np.unique(s)):
        codes[value] = position
    return s.map(codes)
df.apply(intify)
Or in a single line
df.apply(lambda s: s.map({k:i for i,k in enumerate(s.unique())}))
How to count the frequency of numbers given in a text file. The text file is as follows.
0
2
0
1
0
1
55
100
100
I want the output as follows
0 3
1 2
2 1
55 1
100 2
I tried this without success
def histogram( A, flAsList=False ):
    """Return histogram of values in array A.

    Args:
        A: iterable of hashable values.
        flAsList: when true, return a list of ``(value, count)`` pairs
            instead of a dict.

    Returns:
        A dict mapping each value to its number of occurrences (keys in
        first-seen order), or the equivalent list of pairs.
    """
    # Imported locally so the function does not depend on a module-level import.
    from collections import Counter
    H = dict(Counter(A))
    if flAsList:
        # items() is a view on Python 3; materialise it so callers really get a list.
        return list(H.items())
    return H
Is there a better way? Thanks in advance!
Use Counter. It's the best tool for this type of problem.
from collections import Counter
# Read all whitespace-separated tokens from the file.
with open('file.txt', 'r') as fd:
    lines = fd.read().split()
counter = Counter(lines)
# sorts items numerically (the keys are strings)
items = sorted(counter.items(), key=lambda x: int(x[0]))
# prints desired output
for k, repetitions in items:
    # Formatted explicitly so the line works as a print() call on Python 3
    # while keeping the "<key> <tab><count>" layout of the Python 2 statement.
    print('{0} \t{1}'.format(k, repetitions))
The output:
0 3
1 2
2 1
55 1
100 2
Use a Counter object for this:
from collections import Counter
c = Counter(A)
Now the c variable will hold a frequency map of each of the values. For instance:
Counter(['a', 'b', 'c', 'a', 'c', 'a'])
=> Counter({'a': 3, 'c': 2, 'b': 1})
Please consider using update:
def histogram( A, flAsList=False ):
    """Return histogram of values in array A.

    Args:
        A: iterable of hashable values.
        flAsList: when true, return a list of ``(value, count)`` pairs
            instead of a dict.

    Returns:
        A dict mapping each value to its number of occurrences, or the
        equivalent list of pairs.
    """
    H = {}
    for val in A:
        # `in` replaces dict.has_key(), which does not exist on Python 3.
        if val in H:
            H[val] = H[val] + 1
        else:
            H.update({val : 1})
    if flAsList:
        # items() is a view on Python 3; materialise it so callers really get a list.
        return list(H.items())
    return H
Simple approach using a dictionary:
# Count how often each stripped line occurs in the file.
histogram = {}
with open("file","r") as f:
    for line in f:
        key = line.strip()
        if not key:
            # Blank lines carry no number and would break the int() sort below.
            continue
        histogram[key] = histogram.get(key, 0) + 1
# Keys are strings, so sort them by their integer value.
for key in sorted(histogram, key=int):
    # Formatted explicitly so the line works as a print() call on Python 3
    # while keeping the "<key> <tab><count>" layout of the Python 2 statement.
    print("{0} \t{1}".format(key, histogram[key]))
Output:
0 3
1 2
2 1
55 1
100 2
Edit:
To select a specific column you'd want to split the line using split(). For example the sixth field by splitting on a single space:
# Count the sixth space-separated field of the current line (index 5).
field = line.strip().split(' ')[5]
histogram[field] = histogram.get(field, 0) + 1