I have the following code:
import pandas as pd
from pandas_datareader import data as web
import numpy as np
import math
data = web.DataReader('goog', 'yahoo')
data['lifetime'] = data['High'].asfreq('D').rolling(window=999999, min_periods=1).max()  # to check if it is a lifetime high
How can I compare them so that I get a boolean (preferably as 1 and 0) indicating whether data['High'] is close to data['lifetime'] for each row? I tried:
data['isclose'] = math.isclose(data['High'], data['lifetime'], rel_tol=0.003)
but this fails because math.isclose() works on scalars, not on whole Series.
Any help would be appreciated.
You can use np.where() together with np.isclose(), the vectorized equivalent of math.isclose() (note that numpy spells the keyword rtol, not rel_tol):
import numpy as np
data['isclose'] = np.where(np.isclose(data['High'], data['lifetime'], rtol=0.003), 1, 0)
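Since np.isclose() already returns a boolean array, a shorter variant (same data as above) is to cast it directly:
data['isclose'] = np.isclose(data['High'], data['lifetime'], rtol=0.003).astype(int)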
You could also use pandas' apply() function:
import math
from pandas_datareader import data as web
data = web.DataReader("goog", "yahoo")
data["lifetime"] = data["High"].asfreq("D").rolling(window=999999, min_periods=1).max()
data["isclose"] = data.apply(
lambda row: 1 if math.isclose(row["High"], row["lifetime"], rel_tol=0.003) else 0,
axis=1,
)
print(data)
However, yudhiesh's solution using np.where() is faster.
See also: Why is np.where faster than pd.apply
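For a rough sense of the difference, here is a minimal timing sketch (assuming the data frame from above; absolute numbers will vary by machine and data size):
import math
import timeit
import numpy as np

t_np = timeit.timeit(
    lambda: np.where(np.isclose(data["High"], data["lifetime"], rtol=0.003), 1, 0),
    number=100,
)
t_apply = timeit.timeit(
    lambda: data.apply(
        lambda row: 1 if math.isclose(row["High"], row["lifetime"], rel_tol=0.003) else 0,
        axis=1,
    ),
    number=100,
)
print(t_np, t_apply)  # np.where runs the comparison in C; apply calls a Python lambda per row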
The number of rows in my data frame is different for what look like similar filters, and I cannot figure out why. Here is my code:
import numpy as np
import pandas as pd
df = pd.read_csv("Automobile_price_data_clean-f18.csv")
df
df.loc[(df['body-style']== 'hatchback') & df['city-mpg']]
a = df.loc[(df['body-style']== 'hatchback') & df['city-mpg']]
foo_1 = a.count()
b = df.loc[(df['body-style']== 'hatchback')]
foo_2 = b.count()
foo_1 == foo_2
Here is my data - https://paste.pythondiscord.com/apizixigay.apache
Surely the queries are not the same. The first one:
a = df.loc[(df['body-style'] == 'hatchback') & df['city-mpg']]  # also incorporates city-mpg, and is hence more restrictive
To check further, try:
a.shape versus b.shape
and
a['city-mpg'].nunique() versus b['city-mpg'].nunique()
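A minimal sketch of that check (assuming the same CSV as in the question):
import pandas as pd

df = pd.read_csv("Automobile_price_data_clean-f18.csv")

a = df.loc[(df['body-style'] == 'hatchback') & df['city-mpg']]
b = df.loc[df['body-style'] == 'hatchback']

print(a.shape, b.shape)                                  # the row counts differ
print(a['city-mpg'].nunique(), b['city-mpg'].nunique())  # and so can the distinct mpg values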
I have a pandas series that contains:
[-3.86932793e+02 1.82297039e+01 -5.80108624e+01 3.60803151e+00\n -2.23173279e+01 -1.61694102e+01 -1.91569713e+01 -9.71229354e+00\n 1.04943316e+00 -2.32231360e+00 -1.40624006e+01 -7.31842760e+00\n 9.68115460e+00 2.42948531e+01 5.64715091e+00 2.08459357e+00\n -8.29193170e+00 -5.98514877e+00 -5.60237828e+00 5.11533863e+00\n 4.24665522e+00 2.44113892e+00 -9.27428068e-01 2.42668658e+00\n -1.29403291e+00 -6.17909507e+00 3.12809650e+00 8.99939129e+00\n 8.94010048e+00 8.05541832e+00 5.60370916e+00 -6.52764019e+00\n -9.95711382e+00 -2.02809827e-01 2.57034145e+00 -3.20973926e+00\n -9.36473473e+00 -2.29672003e+00 1.43961641e+00 6.63567513e+00]
How do I turn this into an array I can use for sklearn?
You can call .tolist() on a pandas Series to get a plain Python list.
Since you will use it with sklearn, you most likely want a numpy array instead; call .to_numpy(), which returns an np.ndarray.
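A minimal sketch (assuming the Series is named s; sklearn estimators expect a 2-D feature array, hence the reshape):
import pandas as pd

s = pd.Series([-386.932793, 18.2297039, -58.0108624])  # shortened sample of the values above

arr = s.to_numpy()       # 1-D ndarray
X = arr.reshape(-1, 1)   # 2-D column vector, the shape sklearn expects for a single feature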
If what you actually have is the printed string rather than a numeric Series, you can parse it like this:
import pandas as pd
import numpy as np
str_ = '''[-3.86932793e+02 1.82297039e+01 -5.80108624e+01 3.60803151e+00
 -2.23173279e+01 -1.61694102e+01 -1.91569713e+01 -9.71229354e+00
 1.04943316e+00 -2.32231360e+00 -1.40624006e+01 -7.31842760e+00
 9.68115460e+00 2.42948531e+01 5.64715091e+00 2.08459357e+00
 -8.29193170e+00 -5.98514877e+00 -5.60237828e+00 5.11533863e+00
 4.24665522e+00 2.44113892e+00 -9.27428068e-01 2.42668658e+00
 -1.29403291e+00 -6.17909507e+00 3.12809650e+00 8.99939129e+00
 8.94010048e+00 8.05541832e+00 5.60370916e+00 -6.52764019e+00
 -9.95711382e+00 -2.02809827e-01 2.57034145e+00 -3.20973926e+00
 -9.36473473e+00 -2.29672003e+00 1.43961641e+00 6.63567513e+00]'''
str_ = str_.replace('\n', '').replace(']', '').replace('[', '')
array = [float(value) for value in str_.split()]  # split() with no argument handles repeated spaces
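A shorter variant (same assumption that you start from the printed, bracketed string) lets numpy do the parsing:
import numpy as np

arr = np.array(str_.strip('[]').split(), dtype=float)  # split() discards newlines and repeated spaces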
I have all the data (sites and distances) already.
Now I have to form a string matrix to use as input for another Python script.
I have the sites and distances (returned from a query) delimited like this:
A|B|5
A|C|3
A|D|9
B|C|7
B|D|2
C|D|6
How can I create this kind of matrix?
A|B|C|D
A|0|5|3|9
B|5|0|7|2
C|3|7|0|6
D|9|2|6|0
This has to be returned as a string from Python, and I'll have more than 1000 sites, so it should be optimized for that size.
Thanks
I have no doubt it could be done in a cleaner way (because Python).
I will do some more research later on but I do want you to have something to start with, so here it is.
import pandas as pd
data = [
('A','B',5)
,('A','C',3)
,('A','D',9)
,('B','C',7)
,('B','D',2)
,('C','D',6)
]
data.extend([(y, x, val) for x, y, val in data])  # mirror each pair so the matrix comes out symmetric
df = pd.DataFrame(data, columns=['x', 'y', 'val'])
df = df.pivot_table(values='val', index='x', columns='y')
df = df.fillna(0)  # the diagonal (site vs. itself) never appears in the input, so fill it with 0
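Since you need the result as a pipe-delimited string, one option (a sketch; to_csv() returns a string when no path is given) is:
df.index.name = None  # drop the pivot's 'x' label so the header row is just the site names
matrix_str = df.astype(int).to_csv(sep='|')
print(matrix_str)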
Here is a demo for 1000x1000 (takes about 2 seconds):
import pandas as pd, itertools as it
data = [(x,y,val) for val,(x,y) in enumerate(it.combinations(range(1000),2))]
data.extend([(y,x,val) for x,y,val in data])
df = pd.DataFrame(data, columns=['x','y','val'])
df = df.pivot_table(values='val', index='x', columns='y')
df = df.fillna(0)
I have a NumPy array representing a two-dimensional table with row labels and column names, as shown below:
data = array([['','Col1','Col2'],['Row1',1,2],['Row2',3,4]])
I'd like the resulting DataFrame to have Row1 and Row2 as index values, and Col1, Col2 as header values.
I can specify the index as follows:
df = pd.DataFrame(data, index=data[:, 0])
However, I am unsure how best to assign the column headers.
You need to specify data, index and columns to the DataFrame constructor, as in:
>>> pd.DataFrame(data=data[1:,1:], # values
... index=data[1:,0], # 1st column as index
... columns=data[0,1:]) # 1st row as the column names
Edit: as noted in @joris's comment, you may need to change the above to np.int_(data[1:,1:]) to get the correct data type.
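Putting that together (a sketch; np.int_ assumes all the values really are integers):
import numpy as np
import pandas as pd

df = pd.DataFrame(data=np.int_(data[1:, 1:]),  # values, cast from string to int
                  index=data[1:, 0],           # first column as the index
                  columns=data[0, 1:])         # first row as the column names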
Here is an easy-to-understand solution:
import numpy as np
import pandas as pd
# Creating a 2-dimensional numpy array
>>> data = np.array([[5.8, 2.8], [6.0, 2.2]])
>>> data
array([[5.8, 2.8],
       [6. , 2.2]])

# Creating a pandas DataFrame from the numpy array
>>> dataset = pd.DataFrame({'Column1': data[:, 0], 'Column2': data[:, 1]})
>>> print(dataset)
   Column1  Column2
0      5.8      2.8
1      6.0      2.2
I agree with Joris; it seems like you should be doing this differently, like with numpy record arrays. Modifying "option 2" from this great answer, you could do it like this:
import pandas
import numpy
dtype = [('Col1','int32'), ('Col2','float32'), ('Col3','float32')]
values = numpy.zeros(20, dtype=dtype)
index = ['Row'+str(i) for i in range(1, len(values)+1)]
df = pandas.DataFrame(values, index=index)
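To populate it, a structured/record array takes whole rows as tuples (a sketch with made-up values):
values[0] = (7, 24.3, 27.5)  # (Col1, Col2, Col3) for the first row
df = pandas.DataFrame(values, index=index)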
This can be done simply by using the from_records method of pandas DataFrame:
import numpy as np
import pandas as pd
# Creating a numpy array
x = np.arange(1,10,1).reshape(-1,1)
dataframe = pd.DataFrame.from_records(x)
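If you also want a column name, from_records accepts a columns argument (the name here is just an example):
dataframe = pd.DataFrame.from_records(x, columns=['value'])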
>>> import pandas as pd
>>> import numpy as np
>>> data.shape
(480, 193)
>>> type(data)
numpy.ndarray
>>> df = pd.DataFrame(data=data[0:, 0:],
...                   index=[i for i in range(data.shape[0])],
...                   columns=['f' + str(i) for i in range(data.shape[1])])
>>> df.head()
Here is a simple example of creating a pandas DataFrame from a numpy array.
import numpy as np
import pandas as pd
# create an array
var1 = np.arange(start=1, stop=21, step=1).reshape(-1)
var2 = np.random.rand(20,1).reshape(-1)
print(var1.shape)
print(var2.shape)
dataset = pd.DataFrame()
dataset['col1'] = var1
dataset['col2'] = var2
dataset.head()
Adding to @behzad.nouri's answer - we can create a helper routine to handle this common scenario:
import numpy as np
import pandas as pd

def csvDf(dat, **kwargs):
    data = np.array(dat)
    if data is None or len(data) == 0 or len(data[0]) == 0:
        return None
    else:
        # first column -> index, first row -> column names, the rest -> values
        return pd.DataFrame(data[1:, 1:], index=data[1:, 0], columns=data[0, 1:], **kwargs)
Let's try it out:
data = [['','a','b','c'],['row1','row1cola','row1colb','row1colc'],
['row2','row2cola','row2colb','row2colc'],['row3','row3cola','row3colb','row3colc']]
In [61]: csvDf(data)
Out[61]:
a b c
row1 row1cola row1colb row1colc
row2 row2cola row2colb row2colc
row3 row3cola row3colb row3colc
I think this is a simple and intuitive method:
data = np.array([[0, 0], [0, 1] , [1, 0] , [1, 1]])
reward = np.array([1,0,1,0])
dataset = pd.DataFrame()
dataset['StateAttributes'] = data.tolist()
dataset['reward'] = reward.tolist()
dataset
returns:
  StateAttributes  reward
0          [0, 0]       1
1          [0, 1]       0
2          [1, 0]       1
3          [1, 1]       0
But there are performance implications detailed here:
How to set the value of a pandas column as list
It's not so short, but maybe it can help you.
Creating the array
import numpy as np
import pandas as pd
data = np.array([['col1', 'col2'], [4.8, 2.8], [7.0, 1.2]])
>>> data
array([['col1', 'col2'],
['4.8', '2.8'],
['7.0', '1.2']], dtype='<U4')
Creating the data frame
df = pd.DataFrame(data).transpose()  # rows of the input become columns
df.drop(0, axis=1, inplace=True)     # drop the column that held the header strings
df.columns = data[0]                 # use the first row of the input as column names
>>> df
  col1 col2
0  4.8  7.0
1  2.8  1.2