AttributeError: 'DataFrame' object has no attribute 'seek' - python

In the DataProcessor class, the raw_file_processing, dataset_csv, classes_csv, and idset_csv functions process the raw datafile and output csv files that can be read by the read_archive function.
My code raised an AttributeError: 'DataFrame' object has no attribute 'seek'.
import pandas as pd
import warnings
import numpy as np
import os
import zipfile
import re
from sklearn.model_selection import train_test_split
class DataProcesser:
    def __init__(self, archive_path, col_id='ID', col_class='class', col_classname='class_name', col_set='set',
                 read_on_init=True, **kwargs):
        self.archive_path = archive_path
        self.archive = zipfile.ZipFile(self.archive_path, 'r')
        self.col_id = col_id
        self.col_class = col_class
        self.col_classname = col_classname
        self.col_set = col_set
        self.dataset = None
        self.dataset_cropped = None
        self.id_set = None
        self.classes = None
        self.train_set = None
        self.validation_set = None
        self.test_set = None
        self.logs = []
        self.stats = None
        self.flag_subset = False
        self.flag_process = False
        self.flag_split = False
        self.measurement_df = None
        if read_on_init:
            self.read_archive(**kwargs)
    def raw_file_processing(self):
        # If the path contains HTAN CODEX data, perform the following processing steps
        if os.path.isdir(archive_path):
            # 'Class' refers to the independent variable
            # The class info is the 3rd column tile_num in the current example
            # The rationale for looking at tile_num is that if we're examining tumor progression, we can observe the relative positions of the tumor growth
            # Tumor progression may be denoted by the corresponding values of tumor progression markers/antibodies such as CEA
            # In the future, we may append all the tumor patient files and normal patient files and then assign patient number as "class"
            self.col_classname = self.archive_path.iloc[2]
            # Dummy-code the classes
            self.col_class = pd.get_dummies(self.col_classname)
            # Create the ID series by concatenating columns 1-3
            self.col_id = self.archive_path.assign(
                ID=self.archive_path[['cell_id:cell_id', 'region:region', 'tile_num:tile_num']].apply(
                    lambda row: '_'.join([str(each) for each in row]), axis=1))
            self.col_id = self.archive_path.drop(columns=['cell_id:cell_id', 'region:region', 'tile_num:tile_num'])
            # Obtain measurement info
            # Normalize data against blank/empty columns
            # log-transform the data
            for col in self.archive_path[9:]:
                if re.findall(r"Blank|Empty", col):
                    background = col
                else:
                    for index, row in col:
                        norm_data = row / background
                        self.measurement_df = np.log2(norm_data)
            return self.archive_path, self.col_id, self.col_class, self.measurement_df
    def dataset_csv(self):
        # If the path contains HTAN CODEX data, perform the following processing steps
        if os.path.isdir(self.archive_path):
            """Col 1: ID
            Col 2: class
            Col 3-n: measurements"""
            id_col = self.col_id
            self.col_class = self.col_class.to_frame()
            frames = [id_col, self.col_class, self.measurement_df]
            self.dataset = pd.concat(frames)
            data_csv = self.dataset.to_csv("../input_data/dataset.csv")
            return data_csv
    def classes_csv(self):
        # If the path contains HTAN CODEX data, perform the following processing steps
        if os.path.isdir(self.archive_path):
            # Remove any duplicate rows with the same col_class and cls_col info
            self.cls_df = pd.DataFrame({'class': [self.col_class], 'class_name': [self.col_classname]})
            self.cls_df.drop_duplicate(keep=False, inplace=True)
            # Save as csv file
            self.cls_df.to_csv('../input_data/classes.csv')
            return self.cls_df
    def idset_csv(self):
        # If the path contains HTAN CODEX data, perform the following processing steps
        if os.path.isdir(self.archive_path):
            # Get the ids
            ids = self.archive_path[0]
            # Train-test-validation split
            ids.sample(frac=1)
            train, test = train_test_split(ids, test_size=0.2, random_state=1)
            train, val = train_test_split(train, test_size=0.25, random_state=1)
            # Assuming train, val, test are dataframes
            # A string is assigned to the "set" column.
            train.loc[:, 'set'] = 'train'
            val.loc[:, 'set'] = 'val'
            test.loc[:, 'set'] = 'test'
            # Save as csv file
            id_set = pd.concat([train, val, test], axis=0)
            id_set_csv = id_set.to_csv('../input_data/id_set.csv', index=False)
            return id_set_csv
    def zip_files(self):
        # Create a ZipFile object for dataset.csv, classes.csv, and id_set.csv
        zip = ZipFile("data.zip", "w")
        zip.write("dataset.csv")
        zip.write("classes.csv")
        zip.write("id_set.csv")
        zip.close()
        return zip
    def read_archive(self, datatable=True, **kwargs):
        """
        Read a zip archive, without extraction, that contains:
        * data as .csv, observations in rows, measurements in columns. Names of columns must have the format:
          A_1, A_2, A_3, ..., C_1, C_2, ... where A and C are groups (sensors) and 1, 2, 3 ... measurement time
        * IDs of training/validation/test as .csv
        * Explicit name of classes as .csv
        :return: 2 pandas DataFrames, one with raw data, one with IDs
        """
        if datatable:
            try:
                from datatable import fread
                self.dataset = fread(self.archive.open('dataset.csv'), **kwargs).to_pandas()
                self.id_set = fread(self.archive.open('id_set.csv'), **kwargs).to_pandas()
                self.classes = fread(self.archive.open('classes.csv'), **kwargs).to_pandas()
            except ModuleNotFoundError:
                warnings.warn('datatable module not found, using pandas instead. To prevent this message from appearing'
                              ' use "datatable = False" when reading the archive.')
                self.dataset = pd.read_csv(self.archive.open('dataset.csv'))
                self.id_set = pd.read_csv(self.archive.open('id_set.csv'))
                self.classes = pd.read_csv(self.archive.open('classes.csv'))
        else:
            self.dataset = pd.read_csv(self.archive.open('dataset.csv'))
            self.id_set = pd.read_csv(self.archive.open('id_set.csv'))
            self.classes = pd.read_csv(self.archive.open('classes.csv'))
        self.check_datasets()
        self.logs.append('Read archive: {0}'.format(self.archive_path))
        return None
input_path = "//wsl$/Ubuntu-20.04/home/melissachua/CODEX/input_data"
# Open all the subfolders within path
for root, dirs, files in os.walk(input_path):
    for file in files:
        with open(os.path.join(root, file), "r") as data:
            data_file = pd.read_csv(data)
            data = DataProcesser(data_file, datatable=False)

meas_var = None
start_time = None
end_time = None

# Open all the subfolders within path
for root, dirs, files in os.walk(input_path):
    for file in files:
        with open(os.path.join(root, file), "r") as data:
            data_file = pd.read_csv(data)
            # The data object is used to automatically derive some parameters (e.g. number of classes)
            data = DataProcesser(data_file, datatable=False)
Traceback
Traceback (most recent call last):
  File "C:/Users/User/PycharmProjects/CODEX/main.py", line 171, in <module>
    data = DataProcesser(data_file, datatable=False)
  File "C:/Users/User/PycharmProjects/CODEX/main.py", line 16, in __init__
    self.archive = zipfile.ZipFile(self.archive_path, 'r')
  File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\zipfile.py", line 1269, in __init__
    self._RealGetContents()
  File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\zipfile.py", line 1332, in _RealGetContents
    endrec = _EndRecData(fp)
  File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\zipfile.py", line 264, in _EndRecData
    fpin.seek(0, 2)
  File "C:\Users\User\PycharmProjects\CODEX\venv\lib\site-packages\pandas\core\generic.py", line 5487, in __getattr__
    return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'seek'

Process finished with exit code 1

The error you are getting is from the zipfile.ZipFile call. You should pass (the path to) a .zip file to your constructor, not a pandas DataFrame.

In your code you have the following lines:
data_file = pd.read_csv(data)
data = DataProcesser(data_file, datatable=False)
With the first line you are reading a csv file into a DataFrame and storing this DataFrame in a variable data_file.
The second line uses this DataFrame as input for your DataProcesser constructor. The constructor, however, is defined as follows:
def __init__(self, archive_path, col_id='ID', col_class='class', col_classname='class_name', col_set='set',
read_on_init=True, **kwargs):
You are passing your DataFrame as archive_path, which is not what your constructor expects. The constructor expects a str with a file name (a file object would also work), which it hands to the zipfile constructor and the os functions; see the zipfile documentation for example.
Therefore, store your DataFrame in another variable and use archive_path for the file path only. Unfortunately, you have mixed up the DataFrame instance and the archive_path multiple times in your code. Here are a few examples:
# Constructor
self.archive = zipfile.ZipFile(self.archive_path, 'r')
...
# First method
if os.path.isdir(archive_path):
    ...
    self.col_classname = self.archive_path.iloc[2]
    ...
    for col in self.archive_path[9:]:
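For example, a corrected call site could look like the following sketch (the paths here are hypothetical placeholders, not from your project):
import pandas as pd

# Keep the archive path (a str) and any DataFrame in separate variables.
archive_path = '../input_data/data.zip'                   # hypothetical path to the .zip archive
data_file = pd.read_csv('../input_data/some_table.csv')   # hypothetical csv read into a DataFrame

# Pass the path, not the DataFrame, to the constructor:
data = DataProcesser(archive_path, datatable=False)
The same separation applies inside the class: use self.archive_path only where a path is expected (zipfile.ZipFile, os.path.isdir) and a separate attribute, e.g. self.measurement_df, where you mean the data itself.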

Related

I am getting this error - raise ValueError("Unsupported predictor value: %d"%ft) TypeError: %d format: a real number is required, not bytes

I am trying to extract text from PDFs and compare the info, finally saving it as an Excel file. But while running it (the code is given below), I get the error. I have provided the whole traceback.
import pdfminer
import pandas as pd
from time import sleep
from tqdm import tqdm
from itertools import chain
import slate

# List of pdf files to process
pdf_files = ['file1.pdf', 'file2.pdf']

# Create a list to store the text from each PDF
pdf1_text = []
pdf2_text = []

# Iterate through each pdf file
for pdf_file in tqdm(pdf_files):
    # Open the pdf file
    with open(pdf_file, 'rb') as pdf_now:
        # Extract text using slate
        text = slate.PDF(pdf_now)
        text = text[0].split('\n')
        if pdf_file == pdf_files[0]:
            pdf1_text.append(text)
        else:
            pdf2_text.append(text)
        sleep(20)

pdf1_text = list(chain.from_iterable(pdf1_text))
pdf2_text = list(chain.from_iterable(pdf2_text))
differences = set(pdf1_text).symmetric_difference(pdf2_text)

## Create a new dataframe to hold the differences
differences_df = pd.DataFrame(columns=['pdf1_text', 'pdf2_text'])
# Iterate through the differences and add them to the dataframe
for difference in differences:
    # Create a new row in the dataframe with the difference from pdf1 and pdf2
    differences_df = differences_df.append({'pdf1_text': difference if difference in pdf1_text else '',
                                            'pdf2_text': difference if difference in pdf2_text else ''}, ignore_index=True)

# Write the dataframe to an excel sheet
differences_df = differences_df.applymap(lambda x: x.encode('unicode_escape').decode('utf-8') if isinstance(x, str) else x)
differences_df.to_excel('differences.xlsx', index=False, engine='openpyxl')

import openpyxl
import re

# Load the Excel file into a dataframe
df = pd.read_excel("differences.xlsx")

# Create a condition to check the number of words in each cell
for column in ["pdf1_text", "pdf2_text"]:
    df[f"{column}_word_count"] = df[column].str.split().str.len()
    condition = df[f"{column}_word_count"] < 10
    # Drop the rows that meet the condition
    df = df[~condition]

for column in ["pdf1_text", "pdf2_text"]:
    df = df.drop(f"{column}_word_count", axis=1)

# Save the modified dataframe to a new Excel file
df.to_excel("differences.xlsx", index=False)
This is my code, and below is the whole traceback of the error I am getting:
Traceback (most recent call last):
  File "c:\Users\lmohandas\stuff\1801pdfs\slatetrial.py", line 22, in <module>
    text = slate.PDF(pdf_now)
  File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\slate\classes.py", line 61, in __init__
    self.doc = PDFDocument(self.parser, password)
  File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\pdfdocument.py", line 558, in __init__
    self.read_xref_from(parser, pos, self.xrefs)
  File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\pdfdocument.py", line 789, in read_xref_from
    xref.load(parser)
  File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\pdfdocument.py", line 242, in load
    self.data = stream.get_data()
  File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\pdftypes.py", line 292, in get_data
    self.decode()
  File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\pdftypes.py", line 283, in decode
    data = apply_png_predictor(pred, colors, columns, bitspercomponent, data)
  File "C:\Users\lmohandas\AppData\Local\Programs\Python\Python310\lib\site-packages\pdfminer\utils.py", line 46, in apply_png_predictor
    raise ValueError("Unsupported predictor value: %d"%ft)
TypeError: %d format: a real number is required, not bytes

How to insert a data frame as an object attribute

This is most likely a pretty basic question, but I am still learning about classes/objects/constructors/etc. and I am trying to apply some of these concepts to my current workflow.
I am trying to create a class that automatically saves my data frame as a CSV or xlsx file, depending on what I specify, to a given folder. However, I don't believe that I am correctly passing my data frame as an object attribute. This is my code as it stands:
award_date_change = merged_df.loc[merged_df['award_date_change'] == 'yes']  # this is my data frame

class uploading_to_GC:
    def __init__(self, file_name, file_type, already_exists):  # constructor where I want to pass my data frame, the file type to save to, and whether the file already exists in my folder
        self.file_name = file_name
        self.file_type = file_type
        self.already_exists = already_exists

    def print_file_name(self):
        self.file_name.head(5)

    def private_workspace(self):
        commonPath = os.path.expanduser(r"~\path")
        GCdocs = commonPath + '384593683' + '\\'
        path = GCdocs + "" + file_name
        if len(self.file_name) != 0 and self.already_exists == True:  # if the file already exists in the folder
            if self.file_type == "csv":  # for csv files
                GC_old = pd.read_csv(path)
                GC_new = GC_old.append(self.file_name, ignore_index=True)
                GC_new.to_csv(path, index=False)
                print("csv file is updated to private workspace in GCdocs")
            elif self.file_type == "xlsx":  # for xlsx files
                GC_old = pd.read_csv(path)
                GC_new = GC_old.append(self.file_name, ignore_index=True)
                GC_new.to_excel(path, index=False)
                print("excel file is updated to private workspace in GCdocs")
            else:
                print("unrecognized file type")
        elif len(self.file_name) != 0 and self.already_exists == False:  # if the file does not already exist in the folder
            if self.file_type == "csv":
                self.file_name.to_csv(path, index=False)
            if self.file_type == "xlsx":
                self.file_name.to_excel(path, index=False)
            else:
                print("unrecognized file type")
        else:
            print("there is no data to upload")

award_date_change = uploading_to_GC(award_date_change, "csv", False)
award_date_change.private_workspace
I am aware that I don't need to use a class to do this, but I wanted to challenge myself to start using classes more often. Any help would be appreciated
You can pass and store a df in a class as a data member very simply:
class Foo:
    def __init__(self, df: pd.DataFrame):
        self.df = df
        # or, if you want to be sure you don't modify the original df
        self.df = df.copy()

df = pd.DataFrame()
foo_obj = Foo(df)
Edit: the : pd.DataFrame is a type hint. It does not affect the actual code; it merely tells the reader that a pd.DataFrame is expected as input. Good IDEs will also warn you if you don't pass a DataFrame.
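Applied to your class, a minimal sketch (assuming the DataFrame is stored as its own attribute and file_name becomes a plain string; the names and folder below are illustrative, not your exact workflow) might be:
import os
import pandas as pd

class UploadingToGC:
    def __init__(self, df: pd.DataFrame, file_name: str, file_type: str):
        self.df = df                # the data frame itself
        self.file_name = file_name  # a plain string such as 'award_date_change.csv'
        self.file_type = file_type

    def save(self, folder: str):
        # Build the full path from the folder and the file name string.
        path = os.path.join(folder, self.file_name)
        if self.file_type == "csv":
            self.df.to_csv(path, index=False)
        elif self.file_type == "xlsx":
            self.df.to_excel(path, index=False)
        else:
            print("unrecognized file type")

# usage, with a hypothetical folder:
# uploader = UploadingToGC(award_date_change, 'award_date_change.csv', 'csv')
# uploader.save(os.path.expanduser('~'))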

Getting "ValueError: substring not found" when trying to parse gzipped CSV file

I am new to Python. I am getting an error on the line
headline = csv_str[:csv_str.index('\n')];
The error is:
ValueError: substring not found
I do not know why it occurs. I searched Google but could not find a solution. Here is the complete code:
# This is a sample Python script.
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
import matplotlib as mpl;
import matplotlib.pyplot as plt;
import numpy as np;
import gzip;
# import StringIO; Updated one is below
# import io.StringIO
from io import StringIO

def parse_header_of_csv(csv_str):
    # Isolate the headline columns: int(csv_str.index('\n'))
    headline = csv_str[:csv_str.index('\n')];
    columns = headline.split(',');
    # The first column should be timestamp:
    assert columns[0] == 'timestamp';
    # The last column should be label_source:
    assert columns[-1] == 'label_source';
    # Search for the column of the first label:
    for (ci, col) in enumerate(columns):
        if col.startswith('label:'):
            first_label_ind = ci;
            break;
        pass;
    # Feature columns come after timestamp and before the labels:
    feature_names = columns[1:first_label_ind];
    # Then come the labels, till the one-before-last column:
    label_names = columns[first_label_ind:-1];
    for (li, label) in enumerate(label_names):
        # In the CSV the label names appear with prefix 'label:', but we don't need it after reading the data:
        assert label.startswith('label:');
        label_names[li] = label.replace('label:', '');
        pass;
    return (feature_names, label_names);

def parse_body_of_csv(csv_str, n_features):
    # Read the entire CSV body into a single numeric matrix:
    full_table = np.loadtxt(StringIO.StringIO(csv_str), delimiter=',', skiprows=1);
    # Timestamp is the primary key for the records (examples):
    timestamps = full_table[:, 0].astype(int);
    # Read the sensor features:
    X = full_table[:, 1:(n_features + 1)];
    # Read the binary label values, and the 'missing label' indicators:
    trinary_labels_mat = full_table[:, (n_features + 1):-1];  # This should have values of either 0., 1. or NaN
    M = np.isnan(trinary_labels_mat);  # M is the missing label matrix
    Y = np.where(M, 0, trinary_labels_mat) > 0.;  # Y is the label matrix
    return (X, Y, M, timestamps);

'''
Read the data (precomputed sensor-features and labels) for a user.
This function assumes the user's data file is present.
'''
def read_user_data(uuid):
    user_data_file = '%s.features_labels.csv.gz' % uuid;
    # Read the entire csv file of the user:
    with gzip.open(user_data_file, 'rt') as fid:
        csv_str = fid.read();
        pass;
    (feature_names, label_names) = parse_header_of_csv(csv_str);
    n_features = len(feature_names);
    (X, Y, M, timestamps) = parse_body_of_csv(csv_str, n_features);
    return (X, Y, M, timestamps, feature_names, label_names);

def print_hi(name):
    # Use a breakpoint in the code line below to debug your script.
    print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.

# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    uuid = '1155FF54-63D3-4AB2-9863-8385D0BD0A13';
    (X, Y, M, timestamps, feature_names, label_names) = read_user_data(uuid);

# See PyCharm help at https://www.jetbrains.com/help/pycharm/
Traceback
Traceback (most recent call last):
  File "C:\Program Files\JetBrains\PyCharm 2020.2.1\plugins\python\helpers\pydev\pydevd.py", line 1448, in _exec
    pydev_imports.execfile(file, globals, locals)  # execute the script
  File "C:\Program Files\JetBrains\PyCharm 2020.2.1\plugins\python\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "C:/Users/Nafees Ahmed/PycharmProjects/Extra_Sensory_Experimetns/main.py", line 92, in <module>
    (X, Y, M, timestamps, feature_names, label_names) = read_user_data(uuid);
  File "C:/Users/Nafees Ahmed/PycharmProjects/Extra_Sensory_Experimetns/main.py", line 77, in read_user_data
    (feature_names, label_names) = parse_header_of_csv(csv_str);
  File "C:/Users/Nafees Ahmed/PycharmProjects/Extra_Sensory_Experimetns/main.py", line 17, in parse_header_of_csv
    headline = csv_str[:csv_str.index('\n')];
ValueError: substring not found
gzip.open() uses binary mode by default, so csv_str is a byte string, not a string. Open the file in text mode to get ordinary strings. Change
with gzip.open(user_data_file, 'r') as fid:
to
with gzip.open(user_data_file, 'rt') as fid:
Also, instead of writing your own code to parse the CSV file, I suggest you use the csv module.
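For example, a minimal sketch of the header parsing with the csv module (assuming the same file layout as above) could be:
import csv
import gzip

with gzip.open(user_data_file, 'rt') as fid:  # text mode, as above
    reader = csv.reader(fid)
    columns = next(reader)                    # the headline columns, already split
assert columns[0] == 'timestamp'
assert columns[-1] == 'label_source'
This avoids the manual index('\n') and split(',') steps entirely.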
gzip.open opens its argument in binary mode by default, which means its read method returns a bytes object.
Because of that, the argument to csv_str.index must also be a bytes object:
headline = csv_str[:csv_str.index(b'\n')]

Python - match string to csv value, then extract adjacent column

I'm very green when it comes to Python, so please forgive my disgusting formatting or poor optimization.
I'm trying to write a script to sort files into new folders based on their name.
In order to match their name to the correct new location, I have a csv file with two columns; the first is part of the name of the file, and the second is the correct folder it belongs in.
So far I have everything written to extract the parts of the file names I need, but now I'm stuck as to how I can match the strings I have to a value in the csv, and then extract the adjacent column.
This is what I have so far:
import os
import csv

def openCSV(csvFile):
    file = open(csvFile)
    reader = csv.DictReader(file)
    data = list(reader)
    return data

def findDemoName(fileName):
    demoName = fileName[16:]
    demoName = demoName[:-11]
    return demoName

def moveFiles(sortingFile, sourceDirectory, destinationDirectory):
    sortingCSV = openCSV(sortingFile)
    srcDir = sourceDirectory
    destDir = destinationDirectory
    for filename in os.listdir(srcDir):
        name = findDemoName(filename)
        print(name)

# begin program
if __name__ == "__main__":
    # set the CSV used to sort the files
    fileToSortFrom = '<csv used for sorting>'
    inputDirectory = '<where the files are located>'
    outputDirectory = '<where I want to move the files>'
    moveFiles(fileToSortFrom, inputDirectory, outputDirectory)
Right now it just prints the extracted portion of the file name so I can make sure it is doing what I want.
So my next steps are
1. Match the extracted portion of the file name to a matching value in the first column of the csv
2. Take the value adjacent to the match and use it to complete the destination path for the file to be moved to
I found this thread match names in csv file to filename in folder, but I don't understand where in the answer the csv is matched against the file names.
If I need to clear up some points let me know and I will.
Thank you in advance for reading :)
EDIT:
I've tried to stumble my way through this, and here's what I have so far:
import os, shutil
import csv

def openCSV(csvFile):
    file = open(csvFile)
    reader = csv.DictReader(file)
    data = list(reader)
    return data

"""def createReader(csvFile):
    file = open(csvFile)
    reader = csv.DictReader(file)
    return reader"""

def extractDemoName(fileName):
    originalName = fileName
    demoName = fileName[16:]
    demoName = demoName[:-11]
    return demoName

def moveFiles(sortingFile, sourceDirectory, destinationDirectory, prefix, suffix):
    reader = openCSV(sortingFile)
    #reader = createReader(sortingFile)
    srcDir = sourceDirectory
    destDir = destinationDirectory
    column1 = 'DemographicName'
    column2 = 'DemographicTypeName'
    folder = ''
    for filename in os.listdir(srcDir):
        name = extractDemoName(filename)
        for row in reader:
            if row(column1) == name:
                folder = row(column2)
                destination = destDir + folder
                file = prefix + name + suffix
                shutil.copy(file, destination)
                print('Moved ' + file + ' to ' + destination)
            #else reader.next()
        print(name)

# begin program
if __name__ == "__main__":
    # set the CSV used to sort the files
    fileToSortFrom = '<csv file>'
    inputDirectory = '<source path>'
    outputDirectory = '<destination path>'
    filePrefix = '<beginning text of files>'
    fileSuffix = '<ending text of files>'
    moveFiles(fileToSortFrom, inputDirectory, outputDirectory, filePrefix, fileSuffix)
But now I'm receiving the following error instead:
Traceback (most recent call last):
  File "script.py", line 63, in <module>
    moveFiles(fileToSortFrom, inputDirectory, outputDirectory, filePrefix, fileSuffix)
  File "script.py", line 38, in moveFiles
    if row(column1) == name:
TypeError: 'collections.OrderedDict' object is not callable
Here is the problem (line 38):
if row(column1) == name:
It should be:
if row[column1] == name:
I haven't checked any other logic in the script :)
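For reference, csv.DictReader yields dict-like rows, so columns are read with square brackets, not called like functions; a tiny sketch (with a hypothetical csv file):
import csv

with open('sorting.csv') as f:           # hypothetical file name
    for row in csv.DictReader(f):
        print(row['DemographicName'])    # subscript access, not row(...)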
This script reads the files from the directory you pass as move_files's from_dir argument.
It checks whether each file in from_dir exists in the csv_file and, if it does, gets the location and moves the file to that directory.
import os
import csv
import shutil

def get_file_sorter_dict(csv_file):
    return dict(list(csv.reader(open(csv_file))))

def move_files(csv_file, from_dir, to_dir):
    file_sorter_dict = get_file_sorter_dict(csv_file)
    for filename in os.listdir(from_dir):
        if file_sorter_dict.get(filename):
            # you can use the location from csv_file to move the file
            # move_to = file_sorter_dict.get(filename)
            # shutil.move(os.path.join(from_dir, filename), move_to)
            # or you can use to_dir to move the file.
            shutil.move(os.path.join(from_dir, filename), to_dir)

if __name__ == "__main__":
    move_files('files_sorter.csv', '.', '../')
The csv I am using looks like:
name, location
"foo.txt","../"
"baz.txt","../"

Tfidf empty vocabulary; perhaps the documents only contain stop words

Currently I am working on a project and using Tfidf to transform X_train data, which contains text data. When I use count_vectorizer.fit_transform(X_train) I get this error:
Traceback (most recent call last):
  File "train.py", line 100, in <module>
    counts = count_vectorizer.fit_transform(X_train)
  File "/home/vishalthadari/Documents/Seperation 1/API's/Confirmation API/python 3 /env/lib/python3.6/site-packages/sklearn/feature_extraction/text.py", line 869, in fit_transform
    self.fixed_vocabulary_)
  File "/home/vishalthadari/Documents/Seperation 1/API's/Confirmation API/python 3 /env/lib/python3.6/site-packages/sklearn/feature_extraction/text.py", line 811, in _count_vocab
    raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop words
I read other Stack Overflow questions like this Link, but I cannot understand how to split the data of X_train.
Here's my Train.py file
import os
import numpy
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

NEWLINE = '\n'

TRAVEL = 'Travel'
OTHER = 'Other'

SOURCES = [
    ('data/travel', TRAVEL),
    ('data/other', OTHER),
]

SKIP_FILES = {'cmds', '.DS_Store'}

SEED = 0  # for reproducibility

def read_files(path):
    # Reads all files in all directories mentioned in SOURCES
    for root, dir_names, file_names in os.walk(path):
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            if file_name not in SKIP_FILES:
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    past_header, lines = False, []
                    f = open(file_path, encoding="latin-1")
                    for line in f:
                        if past_header:
                            lines.append(line)
                        elif line == NEWLINE:
                            past_header = True
                    f.close()
                    content = NEWLINE.join(lines)
                    yield file_path, content

def build_data_frame(path, classification):
    # Returns a data frame of all the files read using read_files()
    data_frame = DataFrame({'text': [], 'class': []})
    for file_name, text in read_files(path):
        data_frame = data_frame.append(
            DataFrame({'text': [text], 'class': [classification]}, index=[file_name]))
    return data_frame

data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))
data = data.reindex(numpy.random.permutation(data.index))

# Training data
X_train = numpy.asarray(data['text'])
count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(X_train)
I followed all the suggested solutions but still didn't solve the issue. Am I taking the wrong approach to transform the data? If my approach is right, why am I getting this error?
Thanks in Advance
