Converting CSV file data into federated data - python

I am trying to convert my CSV dataset into federated data. Below are the code and the error I get when I run it.
Code:
import collections
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_federated as tff

np.random.seed(0)

df = pd.read_csv('path to my csv file')
client_id_colname = 'aratio: continuous.'
SHUFFLE_BUFFER = 1000
NUM_EPOCHS = 1

# split client id into train and test clients
client_ids = df[client_id_colname].unique()
train_client_ids = sample(client_ids.tolist(),500)
test_client_ids = [x for x in client_ids if x not in train_client_ids]

def create_tf_dataset_for_client_fn(client_id):
    client_data = df[df[client_id_colname] == client_id]
    dataset = tf.data.Dataset.from_tensor_slices(client_data.to_dict('list'))
    dataset = dataset.shuffle(SHUFFLE_BUFFER).batch(1).repeat(NUM_EPOCHS)
    return dataset

train_data = tff.simulation.ClientData.from_clients_and_fn(
    client_ids=train_client_ids,
    create_tf_dataset_for_client_fn=create_tf_dataset_for_client_fn
)
test_data = tff.simulation.ClientData.from_clients_and_fn(
    client_ids=test_client_ids,
    create_tf_dataset_for_client_fn=create_tf_dataset_for_client_fn
)
Error: ---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-7-9d85508920a8> in <module>
15 # split client id into train and test clients
16 client_ids = df[client_id_colname].unique()
---> 17 train_client_ids = sample(client_ids.tolist(),500)
18 test_client_ids = [x for x in client_ids if x not in train_client_ids]
19
NameError: name 'sample' is not defined

Python cannot find the sample function. The code needs to import it from somewhere; two possible candidates are:
random.sample
numpy.random.sample
Note that numpy.random.sample has a different signature (it draws random floats in [0, 1), rather than sampling from a list), so random.sample is the one you want here. To use it, add import random and change the sample line to:
train_client_ids = random.sample(client_ids.tolist(), 500)

Alternatively, add the following line to your import statements:
from random import sample
With this form, the existing call sample(client_ids.tolist(), 500) works unchanged.
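For reference, a minimal sketch of the corrected split, assuming you go with random.sample (the seed line is my addition and only makes the split reproducible):

import random

random.seed(0)  # optional: make the 500-client split reproducible
client_ids = df[client_id_colname].unique()
train_client_ids = random.sample(client_ids.tolist(), 500)
test_client_ids = [x for x in client_ids if x not in train_client_ids]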

Related

Want to use the output created from the first def as input variables for a second def within a class in python

Here I have created the output x in the first function (null_checking) and want to use the same output (x) as the input to the second function (variance) within the class. I have tried a lot but could not get it to work.
#import dependencies
from optparse import Values
from re import X
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import pandas as pd
from .prac import checking_null

#import datasets
datasets = pd.read_csv('Car_sales.csv')
datasets

features = ['Fuel_efficiency', 'Power_perf_factor', 'Engine_size', 'Horsepower', 'Fuel_capacity', 'Curb_weight']
features

class extraction():
    def __init__(self, datasets, features):
        self.features = features
        self.datasets = datasets

    #checking and removing null rows present in the dataframe
    def null_checking(self):
        datasets1 = self.datasets[self.features]
        print(datasets1)
        for items, x in enumerate(datasets1):
            if items == 'True':
                continue
            x = datasets1.dropna(axis=0)
            print(x)

    #calculating variance inflation factors
    def variance(self):
        x.inner_display()
        #we need intercept for calculating variance inflation factor
        x['intercept'] = 1
        print(x)
        #Making the new dataframe
        df = pd.DataFrame()
        df['variables'] = x.columns
        df['VIF'] = [variance_inflation_factor(x, i) for i in range(x.shape[1])]
        print(df)
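No answer is recorded for this one, but a minimal sketch of the usual pattern, assuming the goal is simply to reuse the cleaned dataframe: have null_checking return it and let variance call it. The .copy() and the x.values argument to variance_inflation_factor are my additions; the rest follows the question's code:

import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

class extraction:
    def __init__(self, datasets, features):
        self.features = features
        self.datasets = datasets

    def null_checking(self):
        # keep the selected columns and drop rows with missing values
        return self.datasets[self.features].dropna(axis=0)

    def variance(self):
        x = self.null_checking().copy()  # reuse the first method's output
        x['intercept'] = 1               # VIF needs an intercept column
        vif = pd.DataFrame()
        vif['variables'] = x.columns
        vif['VIF'] = [variance_inflation_factor(x.values, i)
                      for i in range(x.shape[1])]
        return vif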

TypeError: '>' not supported between instances of 'numpy.ndarray' and 'str' error

I'm working on an assignment on a COVID-19 dataset, using the apriori algorithm to analyze it, but I keep running into problems (I'm very new to programming). I imported these libraries first:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
!pip install efficient_apriori
#load the data
from google.colab import files
uploaded = files.upload()
#read the file and display it
df = pd.read_csv('covid_dataset.csv')
df
#did this to turn the dataframe into a list of tuples
records = df.to_records(index=False)
result = list(records)
#here is the problem
from efficient_apriori import apriori
min_support = 7/len(result)
min_confidence = 0
itemsets, rules = apriori(result, min_support=min_support, min_confidence=min_confidence)
#here is the error
TypeError Traceback (most recent call last)
<ipython-input-17-e29e44a028ef> in <module>()
6 # For now set min_confidence = 0 to obtain all the rules
7 min_confidence = 0
----> 8 itemsets, rules = apriori(result, min_support=min_support, min_confidence=min_confidence)
1 frames
/usr/local/lib/python3.7/dist-packages/efficient_apriori/itemsets.py in itemsets_from_transactions(transactions, min_support, max_length, verbosity, output_transaction_ids)
312 # Retrieve the itemsets of the previous size, i.e. of size k - 1
313 # They must be sorted to maintain the invariant when joining/pruning
--> 314 itemsets_list = sorted(item for item in large_itemsets[k - 1].keys())
315
316 # Gen candidates of length k + 1 by joining, prune, and copy as set
TypeError: '>' not supported between instances of 'numpy.ndarray' and 'str'
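No answer is recorded here, but the traceback shows efficient_apriori failing while sorting candidate items, which happens when the transactions contain NumPy objects (from to_records) that cannot be compared with strings. A minimal sketch of one likely fix, converting every row to a plain tuple of strings before calling apriori (the str() conversion is an assumption about how you want mixed-type columns treated):

from efficient_apriori import apriori

# build plain-Python transactions: each row becomes a tuple of hashable,
# mutually comparable items, instead of numpy record objects
transactions = [tuple(str(v) for v in row)
                for row in df.itertuples(index=False)]

min_support = 7 / len(transactions)
itemsets, rules = apriori(transactions, min_support=min_support, min_confidence=0)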

Apriori Algorithm in Data Mining - How to resolve TypeError regarding the TransactionEncoder() in python?

I am trying to incorporate the apriori algorithm into a python program, but I get a TypeError for the line te_ary = te.fit(dataset).transform(dataset). I believe it has something to do with the fact that I am reading my dataset from a file on my computer, as opposed to manually typing it into the Jupyter notebook. I thought it might have to do with my variables in the line where I declared frequent_itemsets, but the error is from line 3?
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from apyori import apriori
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
filename = '/Users/emitsch/Documents/Database 1.csv'
#loading the excel spreadsheet file with my database
dataset = pd.read_csv(filename, header = None)
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
And this is the error:
TypeError Traceback (most recent call last)
<ipython-input-19-ff180148a5c5> in <module>
1 te = TransactionEncoder()
----> 2 te_ary = te.fit(dataset).transform(dataset)
3 df = pd.DataFrame(te_ary, columns=te.columns_)
4 frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
//anaconda3/lib/python3.7/site-packages/mlxtend/preprocessing/transactionencoder.py in fit(self, X)
54 unique_items = set()
55 for transaction in X:
---> 56 for item in transaction:
57 unique_items.add(item)
58 self.columns_ = sorted(unique_items)
TypeError: 'int' object is not iterable
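There is no answer recorded, but the traceback points at TransactionEncoder.fit iterating over each transaction: it expects a list of transactions (each itself a list of items), not a DataFrame. Iterating a DataFrame yields its column labels, which are plain ints here because of header=None, hence 'int' object is not iterable. A minimal sketch of one likely fix follows; the NaN filtering is an assumption, to handle ragged CSV rows. (Note also that importing apriori from both apyori and mlxtend makes the second import shadow the first.)

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

dataset = pd.read_csv(filename, header=None)

# turn the frame into a list of transactions, dropping empty cells
transactions = [[item for item in row if pd.notna(item)]
                for row in dataset.values.tolist()]

te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)
frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)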

IndexError: index 255 is out of bounds for axis 2 with size 12. What does it mean and how to solve it?

I currently use the Keras SegNet from https://github.com/0bserver07/Keras-SegNet-Basic and have no problem running it. But when I change to my own dataset I get the index error below. I am quite new to Python and the artificial intelligence field, so I hope someone can help me out. I searched for other similar problems and solutions but still don't get it.
This is the error:
Traceback (most recent call last):
  File "C:/Users/ZC/PycharmProjects/segnet(crack)/model.py", line 35, in <module>
    train_data, train_label = load_data("train2")
  File "C:/Users/ZC/PycharmProjects/segnet(crack)/model.py", line 30, in load_data
    label.append(one_hot_it(cv2.imread(txt[i][1][:-1])[:,:,0]))
  File "C:\Users\ZC\PycharmProjects\segnet(crack)\helper.py", line 29, in one_hot_it
    x[i,j,labels[i][j]]=1
IndexError: index 255 is out of bounds for axis 2 with size 12
This is the code (model.py):
from __future__ import absolute_import
from __future__ import print_function
import cv2
import numpy as np
import itertools
from helper import *
import os

# Copy the data to this dir here in the SegNet project /CamVid from here:
# https://github.com/alexgkendall/SegNet-Tutorial
DataPath = 'C:/Keras-SegNet/SegNet/'
data_shape = 360*480

def load_data(mode):
    data = []
    label = []
    with open(DataPath + mode + '.txt') as f:
        txt = f.readlines()
        txt = [line.split(' ') for line in txt]
    for i in range(len(txt)):
        print(txt[i][0])
        print(txt[i][1][:-1])
        img = cv2.imread(txt[i][1][:-1])
        cv2.imshow('image', img)
        data.append(np.rollaxis(normalized(cv2.imread(txt[i][0])), 2))
        label.append(one_hot_it(cv2.imread(txt[i][1][:-1])[:,:,0]))
        print('.', end='')
    return np.array(data), np.array(label)

train_data, train_label = load_data("train2")
train_label = np.reshape(train_label, (300, data_shape, 2))
test_data, test_label = load_data("test")
test_label = np.reshape(test_label, (233, data_shape, 2))

np.save("train_data", train_data)
np.save("train_label", train_label)
np.save("test_data", test_data)
np.save("test_label", test_label)
This is the helper.py file code:
from __future__ import absolute_import
from __future__ import print_function
import cv2
import numpy as np
import itertools
import os

def normalized(rgb):
    #return rgb/255.0
    norm = np.zeros((rgb.shape[0], rgb.shape[1], 3), np.float32)
    b = rgb[:,:,0]
    g = rgb[:,:,1]
    r = rgb[:,:,2]
    norm[:,:,0] = cv2.equalizeHist(b)
    norm[:,:,1] = cv2.equalizeHist(g)
    norm[:,:,2] = cv2.equalizeHist(r)
    return norm

def one_hot_it(labels):
    x = np.zeros([360,480,12])
    for i in range(360):
        for j in range(480):
            x[i,j,labels[i][j]] = 1
    return x
Can anyone also explain what the function in helper.py does, especially the line x[i,j,labels[i][j]]=1?
Here you can see that the x array has a size of 12 on the 3rd axis when it is created: np.zeros([360,480,12]). Therefore, whenever you access something on the 3rd axis, the index must be between 0 and 11.
In this case, labels[i][j] has a value of 255 at some point, so it fails. This is because you call one_hot_it with a raw image instead of a label map (one_hot_it(cv2.imread(txt[i][1][:-1])[:,:,0])): image pixel values run up to 255, while valid class indices must stay below 12.
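As a minimal sketch of one way to make the labels valid, assuming your label images encode each class as a distinct grey value (e.g. 0 and 255 for a two-class crack mask), remap the raw pixel values to contiguous class indices before one-hot encoding; num_classes is an assumption you would set to your own class count:

import numpy as np

def one_hot_it(labels, num_classes=12):
    # map each distinct grey value (e.g. 0 and 255) to a class index 0..k-1
    class_values = np.unique(labels)
    h, w = labels.shape
    x = np.zeros((h, w, num_classes))
    for idx, value in enumerate(class_values):
        x[labels == value, idx] = 1
    return x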

Convert DF into Numpy Array for calculations

I have the data in dataframe format and will use it for a linear regression calculation with a user-built function. Here is the code:
from sklearn.datasets import load_boston
boston = load_boston()
bos = pd.DataFrame(boston.data) # convert to DF
bos.columns = boston.feature_names
bos['PRICE'] = boston.target
y = bos.PRICE
x = bos.drop('PRICE', axis = 1) # DROP PRICE since only want X-type variables (not Y-target)
xw = df.to_array(x)
xw = np.insert(xw,0,1, axis = 1) # to insert a column of "1" values
However, I am getting the error:
AttributeError Traceback (most recent call last)
<ipython-input-131-272f1b4d26ba> in <module>()
1 import copy
2
----> 3 xw = df.to_array(x)
AttributeError: 'int' object has no attribute 'to_array'
I am not sure where the problem is. I need to pass an array of values (x in this case) to the function to execute some matrix operations.
The insert function was working during step-by-step development but for some reason fails here.
I tried:
xw = copy.deepcopy(x)
with no success.
Any thoughts?
It is x.as_matrix(), not df.to_array(x).
Please refer to the pandas documentation for more detail on as_matrix(). (Note: as_matrix() was deprecated in pandas 0.23 and removed in 1.0; on current pandas use x.to_numpy() instead.)
Here is the code that works:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
boston = load_boston()
bos = pd.DataFrame(boston.data) # convert to DF
bos.columns = boston.feature_names
bos['PRICE'] = boston.target
y = bos.PRICE
x = bos.drop('PRICE', axis = 1) # DROP PRICE since only want X-type variables (not Y-target)
xw = x.as_matrix()
xw = np.insert(xw,0,1, axis = 1) # to insert a column of "1" values
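On a current stack, the last two lines would instead be the sketch below (as_matrix() no longer exists in pandas >= 1.0; note also that load_boston was removed from scikit-learn 1.2, so the data loading would need replacing too):

xw = x.to_numpy()                  # or x.values
xw = np.insert(xw, 0, 1, axis=1)   # prepend a column of ones for the intercept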
