Tfidf empty vocabulary; perhaps the documents only contain stop words - python

Currently I am working on a project and using Tfidf to transform the X_train data, which contains the text data. When I call count_vectorizer.fit_transform(X_train) I get this error:
Traceback (most recent call last):
File "", line 100, in <module>
counts = count_vectorizer.fit_transform(X_train)
File "/home/vishalthadari/Documents/Seperation 1/API's/Confirmation API/python 3 /env/lib/python3.6/site-packages/sklearn/feature_extraction/", line 869, in fit_transform
File "/home/vishalthadari/Documents/Seperation 1/API's/Confirmation API/python 3 /env/lib/python3.6/site-packages/sklearn/feature_extraction/", line 811, in _count_vocab
raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop words
I read other Stack Overflow questions like this Link, but I am not able to understand how to split the data of X_train.
Here's my file
import os

import numpy
from pandas import DataFrame
from pandas import concat
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# Line separator; a line equal to it also marks the end of a file's header.
NEWLINE = '\n'

# Class labels.
TRAVEL = 'Travel'
OTHER = 'Other'

# (directory, label) pairs iterated by the loading loop further down.
SOURCES = [
    ('data/travel', TRAVEL),
    ('data/other', OTHER),
]

# File names that are never read.
SKIP_FILES = {'cmds', '.DS_Store'}

SEED = 0  # for reproducibility
def read_files(path):
    """Yield ``(file_path, content)`` for every file found under *path*.

    The directory tree is walked recursively. Files whose name is listed in
    ``SKIP_FILES`` are ignored. For each remaining file, everything up to and
    including the first blank line is treated as a header and dropped; the
    lines after it are joined with ``NEWLINE`` to form ``content``.

    A file that contains no blank line yields an empty string as content.
    NOTE(review): if every file lacks that blank separator line, every
    document is empty, which would explain an "empty vocabulary" error from
    the vectorizer -- confirm against the actual data files.
    """
    # os.walk() already descends into every sub-directory, so no explicit
    # recursion is needed (a recursive call to this generator that is never
    # iterated would do nothing anyway).
    for root, _dir_names, file_names in os.walk(path):
        for file_name in file_names:
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(root, file_name)
            if not os.path.isfile(file_path):
                continue
            past_header, lines = False, []
            # The context manager closes the handle even if reading fails.
            with open(file_path, encoding="latin-1") as f:
                for line in f:
                    if past_header:
                        lines.append(line)
                    elif line == NEWLINE:
                        past_header = True
            yield file_path, NEWLINE.join(lines)
def build_data_frame(path, classification):
    """Return a data frame of all the files read using read_files().

    The frame has one row per file, indexed by the file path, with two
    columns: ``'text'`` (the file content) and ``'class'`` (the given
    *classification*, repeated for every row).

    The rows are collected first and the frame is built once:
    ``DataFrame.append`` copies the whole frame on every call and no longer
    exists in pandas 2.0 and later.
    """
    file_names, texts = [], []
    for file_name, text in read_files(path):
        file_names.append(file_name)
        texts.append(text)
    return DataFrame(
        {'text': texts, 'class': [classification] * len(texts)},
        index=file_names,
    )
# Build one frame per source directory and concatenate them in a single step
# (DataFrame.append is gone from pandas 2.0 and later).
frames = []
for path, classification in SOURCES:
    frames.append(build_data_frame(path, classification))
# concat() rejects an empty list, so fall back to an empty frame.
data = concat(frames) if frames else DataFrame({'text': [], 'class': []})
# Shuffle the rows.
data = data.reindex(numpy.random.permutation(data.index))
#Training data
X_train = numpy.asarray(data['text'])
count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(X_train)
I followed all the solutions but still have not solved the issue. Am I taking the wrong approach to transform the data? If my approach is right, why am I getting this error?
Thanks in Advance


AttributeError: 'DataFrame' object has no attribute 'seek'

In the DataProcessor class, the raw_file_processing, dataset_csv, classes_csv, and idset_csv functions process the raw datafile and output csv files that can be read by the read_archive function.
My code raised AttributeError: 'DataFrame' object has no attribute 'seek' error.
import pandas as pd
import warnings
import numpy as np
import os
import zipfile
import re
from sklearn.model_selection import train_test_split
class DataProcesser:
def __init__(self, archive_path, col_id='ID', col_class='class', col_classname='class_name', col_set='set',
read_on_init=True, **kwargs):
self.archive_path = archive_path
self.archive = zipfile.ZipFile(self.archive_path, 'r')
self.col_id = col_id
self.col_class = col_class
self.col_classname = col_classname
self.col_set = col_set
self.dataset = None
self.dataset_cropped = None
self.id_set = None
self.classes = None
self.train_set = None
self.validation_set = None
self.test_set = None
self.logs = []
self.stats = None
self.flag_subset = False
self.flag_process = False
self.flag_split = False
self.measurement_df = None
if read_on_init:
def raw_file_processing(self):
# If the path contains HTAN CODEX data, perform the following processing steps
if os.path.isdir(archive_path):
# 'Class' refers to the independent variable
# The class info is the 3rd column tile_num in the current example
# The rationale for looking at tile_num is that if we're examining tumor progression, we can observe the relative positions of the tumor growth
# Tumor progression may be denoted by the corresponding values of tumor progression markers/antibodies such as CEA
# In the future, we may append all the tumor patient files and normal patient files and then assign patient number as "class"
self.col_classname = self.archive_path.iloc[2]
# Dummy-code the classes
self.col_class = pd.get_dummies(self.col_classname)
# Create the ID series by concatenating columns 1-3
self.col_id = self.archive_path.assign(
ID=self.archive_path[['cell_id:cell_id', 'region:region', 'tile_num:tile_num']].apply(
lambda row: '_'.join([str(each) for each in row]), axis=1))
self.col_id = self.archive_path.drop(columns=['cell_id:cell_id', 'region:region', 'tile_num:tile_num'])
# Obtain measurement info
# Normalize data against blank/empty columns
# log-transform the data
for col in self.archive_path[9:]:
if re.findall(r"Blank|Empty", col):
background = col
for index, row in col:
norm_data = row / background
self.measurement_df = np.log2(norm_data)
return self.archive_path, self.col_id, self.col_class, self.measurement_df
def dataset_csv(self):
# If the path contains HTAN CODEX data, perform the following processing steps
if os.path.isdir(self.archive_path):
"""Col 1: ID
Col 2: class
Col 3-n: measurements"""
id_col = self.col_id
self.col_class = self.col_class.to_frame()
frames = [id_col, self.col_class, self.measurement_df]
self.dataset = pd.concat(frames)
data_csv = self.dataset.to_csv("../input_data/dataset.csv")
return data_csv
def classes_csv(self):
# If the path contains HTAN CODEX data, perform the following processing steps
if os.path.isdir(self.archive_path):
# Remove any duplicate rows with the same col_class and cls_col info
self.cls_df = pd.DataFrame({'class': [self.col_class], 'class_name': [self.col_classname]})
self.cls_df.drop_duplicate(keep=False, inplace=True)
# Save as csv file
return self.cls_df
def idset_csv(self):
# If the path contains HTAN CODEX data, perform the following processing steps
if os.path.isdir(self.archive_path):
# Get the ids
ids = self.archive_path[0]
# Train-test-validation split
train, test = train_test_split(ids, test_size=0.2, random_state=1)
train, val = train_test_split(train, test_size=0.25, random_state=1)
# Assuming train, val, test are dataframes
# A string is assigned to the "set" column.
train.loc[:, 'set'] = 'train'
val.loc[:, 'set'] = 'val'
test.loc[:, 'set'] = 'test'
# Save as csv file
id_set = pd.concat([train, val, test], axis=0)
id_set_csv = id_set.to_csv('../input_data/id_set.csv', index=False)
return id_set_csv
def zip_files(self):
# Create a ZipFile object for dataset.csv, classes.csv, and id_set.csv
zip = ZipFile("", "w")
return zip
def read_archive(self, datatable=True, **kwargs):
Read a zip archive, without extraction, than contains:
* data as .csv, observations in rows, measurements in columns. Names of columns must have the format:
A_1, A_2, A_3,..., C_1, C_2,... where A and C are groups (sensors) and 1,2,3... measurement time
* IDs of training/validation/test as .csv
* Explicit name of classes as .csv
:return: 2 pandas, one with raw data, one with IDs
if datatable:
from datatable import fread
self.dataset = fread('dataset.csv'), **kwargs).to_pandas()
self.id_set = fread('id_set.csv'), **kwargs).to_pandas()
self.classes = fread('classes.csv'), **kwargs).to_pandas()
except ModuleNotFoundError:
warnings.warn('datatable module not found, using pandas instead. To prevent this message from appearing'
' use "datatable = False" when reading the archive.')
self.dataset = pd.read_csv('dataset.csv'))
self.id_set = pd.read_csv('id_set.csv'))
self.classes = pd.read_csv('classes.csv'))
self.dataset = pd.read_csv('dataset.csv'))
self.id_set = pd.read_csv('id_set.csv'))
self.classes = pd.read_csv('classes.csv'))
self.logs.append('Read archive: {0}'.format(self.archive_path))
return None
input_path = "//wsl$/Ubuntu-20.04/home/melissachua/CODEX/input_data"
# Open all the subfolders within path
for root, dirs, files in os.walk(input_path):
for file in files:
with open(os.path.join(root, file), "r") as data:
data_file = pd.read_csv(data)
data = DataProcesser(data_file, datatable=False)
meas_var = None
start_time = None
end_time = None
# Open all the subfolders within path
for root, dirs, files in os.walk(input_path):
for file in files:
with open(os.path.join(root, file), "r") as data:
data_file = pd.read_csv(data)
# The data object is used to automatically derive some parameters (e.g. number of classes)
data = DataProcesser(data_file, datatable=False)
> Traceback (most recent call last): File
> "C:/Users/User/PycharmProjects/CODEX/", line 171, in <module>
> data = DataProcesser(data_file, datatable=False) File "C:/Users/User/PycharmProjects/CODEX/", line 16, in __init__
> self.archive = zipfile.ZipFile(self.archive_path, 'r') File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\",
> line 1269, in __init__
> self._RealGetContents() File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\",
> line 1332, in _RealGetContents
> endrec = _EndRecData(fp) File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\",
> line 264, in _EndRecData
>, 2) File "C:\Users\User\PycharmProjects\CODEX\venv\lib\site-packages\pandas\core\",
> line 5487, in __getattr__
> return object.__getattribute__(self, name) AttributeError: 'DataFrame' object has no attribute 'seek'
> Process finished with exit code 1
The error you are getting is from the zipfile.ZipFile call. You should pass (the path to) a .zip file to your constructor, not a pandas DataFrame.
In your code you have the following lines:
data_file = pd.read_csv(data)
data = DataProcesser(data_file, datatable=False)
With the first line you are reading a csv file into a DataFrame and storing this DataFrame in a variable data_file.
The second line uses this DataFrame as input for your DataProcesser constructor. The constructor, however, is defined as follows:
def __init__(self, archive_path, col_id='ID', col_class='class', col_classname='class_name', col_set='set',
read_on_init=True, **kwargs):
You are passing your DataFrame as archive_path, which is not what your constructor is expecting. The constructor is expecting a str file name (a file object would also be appropriate), for example for the zipfile constructor or the os functions. See the zipfile documentation for an example.
Therefore, store your DataFrame in another variable and use the archive_path for the file path. Unfortunately, you have mixed up the DataFrame instance and the archive_path multiple times in your code. Here are a few examples.
# Constructor
self.archive = zipfile.ZipFile(self.archive_path, 'r')
# First method
if os.path.isdir(archive_path):
self.col_classname = self.archive_path.iloc[2]
for col in self.archive_path[9:]:

Ignoring specific json files where key [Behavior] is not present

I am working with a huge Cuckoo sandbox dataset containing several .json files. I have to create a CSV file of the API stats found in the behavior section of the JSON files, but if a JSON file doesn't have that section the code stops executing.
here is my program
import pandas as pd
# As of Pandas 1.01, json_normalize as is deprecated and is now exposed in the top-level namespace.
from import json_normalize
from pathlib import Path
import json
import os
path_to_json = 'C:/Users/skdk/Desktop/Ransomware-API/Benign/'
for file_name in [file for file in os.listdir(path_to_json) if file.endswith('.json')]:
with open(path_to_json + file_name, encoding ='utf-8') as json_file:
data = json.load(json_file)
# for path in path_to_json:
# p = Path(path)
# #print(p)
# # read json50
# with'r', encoding='utf-8') as f:
# data = json.loads(
# #print(p)
# behaviorList.append(str(data['behavior']))
apiStatsList = []
for behavior in behaviorList:
for key,value in eval(behavior)['apistats'].items():
fileName = str(pathList[behaviorList.index(behavior)][:pathList[behaviorList.index(behavior)].index('.json')])+"/" + str(key)
dataset2= {}
for key,value in apiStatsList[0].items():
dataset2[key] = [value]
count = 1
for apiStat in apiStatsList[1:]:
for key,value in apiStat.items():
if(key in dataset2):
dataset2[key] = tempList
df2 = pd.DataFrame.from_dict(dataset2, orient='index')
df2 = df2.transpose()
df2 = df2.fillna(0)
I am getting a following error
KeyError Traceback (most recent call last)
<ipython-input-16-fc19a9a3c2d1> in <module>
34 data = json.load(json_file)
35 #print(data)
---> 36 behaviorList.append(str(data['behavior']))
38 # for path in path_to_json:
KeyError: 'behavior'
Any Help is appreciated.
Put it inside a try/except block:
try:
'your code'
except KeyError:
'your code in case the json doesn't have behaviour. It could skip to the next file for example.'
This catches the specified error (or any error, if none is specified). And since you said you are interested only in files with the behavior key, I think it should help you.

Merging csv files using Python

I am starting to learn Python and I would like to merge csv files. I have found the following code :
from os import chdir
from glob import glob
import pandas as pdlib
# Produce a single CSV after combining all files
def produceOneCSV(list_of_files, file_out):
    """Concatenate the CSV files in *list_of_files* and write them to *file_out*.

    Each input is read with ``read_csv``, so its first line is taken as the
    header; the output therefore carries a single header row followed by the
    data rows of every input, in the order given.

    Raises:
        ValueError: if *list_of_files* is empty. pandas would raise the same
            exception type with the less helpful text
            "No objects to concatenate".
    """
    list_of_files = list(list_of_files)
    if not list_of_files:
        raise ValueError(
            "No CSV files to merge: list_of_files is empty "
            "(check the glob pattern and the working directory)")
    # Consolidate all CSV files into one object
    result_obj = pdlib.concat(
        [pdlib.read_csv(csv_path) for csv_path in list_of_files])
    # Convert the above object into a csv file and export
    result_obj.to_csv(file_out, index=False, encoding="utf-8")
# Move to the path that holds our CSV files
csv_file_path = 'c:/Users/user/Desktop/DUT1'
chdir(csv_file_path)
# List all CSV files in the working dir
# The extension is given without a leading dot: the glob template below
# already supplies the dot, and "*..csv" matches no file.
file_pattern = "csv"
file_out = "ConsolidateOutput.csv"
# Leave out the output of a previous run so it is not merged into itself.
list_of_files = [file for file in glob('*.{}'.format(file_pattern))
                 if file != file_out]
produceOneCSV(list_of_files, file_out)
But I get these errors when I try to run it:
Traceback (most recent call last):
File "C:\Users\user\Desktop\DUT1\", line 26, in <module>
produceOneCSV(list_of_files, file_out)
File "C:\Users\user\Desktop\DUT1\", line 12, in produceOneCSV
result_obj = pdlib.concat([pdlib.read_csv(file) for file in list_of_files])
File "C:\Python\Python385\lib\site-packages\pandas\core\reshape\", line 274, in concat
op = _Concatenator(
File "C:\Python\Python385\lib\site-packages\pandas\core\reshape\", line 331, in __init__
raise ValueError("No objects to concatenate")
ValueError: No objects to concatenate
I don't know why it doesn't work.
Furthermore, I would like to remove the headers from all the files except the first one.
I had a similar use-case for which I developed this code chunk. You can try it like this:
import pandas as pd
from glob import glob
import os
def joinCsvFiles(outFile, dirPath, filePattern="*.csv"):
    """Merge every CSV in *dirPath* matching *filePattern* into *outFile*.

    Each part file is read with its first line as the header, so the
    consolidated file carries one header row followed by the data rows of
    all parts.

    Args:
        outFile: path of the consolidated CSV to write.
        dirPath: directory that holds the part files.
        filePattern: glob pattern selecting the part files.

    Raises:
        ValueError: if no file matches the pattern.
    """
    globPattern = os.path.join(dirPath, filePattern)
    # glob() returns matches in arbitrary order; sort so the row order of
    # the merged file is reproducible.
    fileParts = sorted(glob(globPattern))
    if not fileParts:
        raise ValueError("No files match pattern '{}'".format(globPattern))
    dfs = [pd.read_csv(filePart, index_col=False, header=0)
           for filePart in fileParts]
    print("[!]. Merging {} part files to create a consolidated file\n".format(len(dfs)))
    finalDf = pd.concat(dfs, sort=False)
    finalDf.to_csv(outFile, index=False)
    print ("[>]. Consolidated csv file generated successfully at filepath: '{}'\n".format(outFile))
if __name__ == '__main__':
joinCsvFiles("finalReport.csv", "c:/Users/user/Desktop/DUT1", "*.csv")

Splitting CSV file into multiple sheets in an Excel file based on row limit argument

Hi I am trying to run a utility script i found in github
Update 1:
Hi I modified the logic a bit more so that rather than splitting the csv into multiple csvs again i am creating a single excel file with multiple sheets containing the splits. Below is my code
import os
import csv
import openpyxl
import argparse
def find_csv_filenames(path_to_dir, suffix=".csv"):
    """Return the names of the regular files in *path_to_dir* ending in *suffix*.

    Only bare file names are returned, not full paths. Directories whose
    name happens to end in *suffix* are excluded. The result is sorted
    because ``os.listdir`` returns entries in arbitrary order.
    """
    return sorted(
        filename
        for filename in os.listdir(path_to_dir)
        if filename.endswith(suffix)
        and os.path.isfile(os.path.join(path_to_dir, filename))
    )
def is_binary(filename):
    """Return true if the given filename appears to be binary.

    File is considered to be binary if it contains a NULL byte.
    FIXME: This approach incorrectly reports UTF-16 as binary.
    """
    with open(filename, 'rb') as f:
        # The file is opened in binary mode, so every block is a bytes
        # object and must be searched with a bytes needle; a str needle
        # raises TypeError on Python 3. b'\0' also works on Python 2.
        for block in f:
            if b'\0' in block:
                return True
    return False
def split(filehandler, delimiter=',', row_limit=5000,
output_name_template='.xlsx', output_path='.', keep_headers=True):
class MyDialect(csv.excel):
def __init__(self, delimiter=','):
self.delimiter = delimiter
lineterminator = '\n'
my_dialect = MyDialect(delimiter=delimiter)
reader = csv.reader(filehandler, my_dialect)
index = 0
current_piece = 1
# Create a new Excel workbook
# Create a new Excel sheet with name Split1
current_out_path = os.path.join(
wb = openpyxl.Workbook()
ws = wb.create_sheet(index=index, title="Split" + str(current_piece))
current_limit = row_limit
if keep_headers:
headers =
for i, row in enumerate(reader):
if i + 1 > current_limit:
current_piece += 1
current_limit = row_limit * current_piece
ws = wb.create_sheet(index=index, title="Split" + str(current_piece))
if keep_headers:
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Splits a CSV file into multiple pieces.',
parser.add_argument('-l', '--row_limit', type=int, default=5000,
help='The number of rows you want in each output file. (default: 5000)')
args = parser.parse_args()
#Check if output path exists else create new output folder
if not os.path.exists(output_path):
with open('Logger.log', 'a+') as logfile:
logfile.write('Filename --- Number of Rows\n')
#Get list of all csv's in the current folder
filenames = find_csv_filenames(os.getcwd())
rem_filenames = []
for filename in filenames:
if is_binary(filename):
logfile.write('{} --- binary -- skipped\n'.format(filename))
with open(filename, 'rb') as infile:
reader_file = csv.reader(infile,delimiter=";",lineterminator="\n")
value = len(list(reader_file))
logfile.write('{} --- {} \n'.format(filename,value))
filenames = [item for item in filenames if item not in rem_filenames]
logfile.write('#Post Split\n')
for filename in filenames:
with open(filename, 'rb') as infile:
name = filename.split('.')[0]
split(filehandler=infile,delimiter=';',row_limit=args.row_limit,output_name_template= name + '.xlsx',output_path='Output')
I have a folder called 'CSV Files' which contains a lot of csv's which need to be split.
I am keeping this utility script in the same folder
Getting the following error on running the script:
Traceback (most recent call last):
File "", line 96, in <module>
split(filehandler=infile,delimiter=';',row_limit=args.row_limit,output_name_template= name + '.xlsx',output_path='Output')
File "", line 57, in split
File "/home/ramakrishna/.local/lib/python2.7/site-packages/openpyxl/worksheet/", line 790, in append
cell = Cell(self, row=row_idx, col_idx=col_idx, value=content)
File "/home/ramakrishna/.local/lib/python2.7/site-packages/openpyxl/cell/", line 114, in __init__
self.value = value
File "/home/ramakrishna/.local/lib/python2.7/site-packages/openpyxl/cell/", line 294, in value
File "/home/ramakrishna/.local/lib/python2.7/site-packages/openpyxl/cell/", line 191, in _bind_value
value = self.check_string(value)
File "/home/ramakrishna/.local/lib/python2.7/site-packages/openpyxl/cell/", line 156, in check_string
raise IllegalCharacterError
Can someone let me know whether I have to add another for loop that goes through each cell in the row and appends it to the sheet, or whether it can be done in a single go? Also, I seem to have made this logic rather clumsy; can it be optimized further?
Folder structure for your reference
You must pass just a name of the file as command line argument:
python 'Sports & Outdoors 2017-08-26'
Also, I tried running the above script and no matter which CSV I run it on, it doesn't return the first line (which should normally be a header) although keep_headers = True. Setting keep_headers = False also prints out the header line, which is a bit counterintuitive.
This script is meant to read a single CSV. If you want to read every CSV in a directory, you want to make another script that will loop through all the files in that directory.
import splitter as sp
import os
files = [ f for f in os.listdir('/your/directory') if f[-4:] == '.csv' ]
for file in files:
with open(file, 'r') as f:

Loading pickle error : TfidfVectorizer - Vocabulary wasn't fitted

I am making a document classifier and here is my code:
import os
import io
import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer,
TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
def readFiles(path):
    """Yield ``(file_path, message)`` for every file found under *path*.

    The directory tree is walked recursively. In each file, everything up to
    and including the first blank line is treated as a header and skipped;
    the remaining lines are joined with ``'\\n'`` to form ``message``. Each
    line keeps its own trailing newline, so body lines end up separated by
    an empty line. A file without any blank line yields an empty message.
    """
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            # A separate name keeps the *path* argument from being shadowed.
            filePath = os.path.join(root, filename)
            inBody = False
            lines = []
            # The context manager closes the handle even if reading fails.
            with io.open(filePath, 'r', encoding='latin1') as f:
                for line in f:
                    if inBody:
                        lines.append(line)
                    elif line == '\n':
                        inBody = True
            message = '\n'.join(lines)
            yield filePath, message
def dataFrameFromDirectory(path, classification):
    """Return a DataFrame with one row per file read by readFiles(path).

    Rows are indexed by file path and have two columns: ``'resume'`` (the
    file's message text) and ``'class'`` (the given *classification*).
    """
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'resume': message, 'class': classification})
        # Keep the index in step with the rows; a length mismatch makes the
        # DataFrame constructor fail.
        index.append(filename)
    # Explicit columns keep the schema stable when no file was found.
    return DataFrame(rows, index=index, columns=['resume', 'class'])
data = DataFrame({'resume': [], 'class': []})
data = data.append(dataFrameFromDirectory(r'<path>', 'Yes'))
data = data.append(dataFrameFromDirectory(r'<path>', 'No'))
Then I split the data, and used Tfidf Vectorizer:
tf=TfidfVectorizer(min_df=1, stop_words='english')
After training and testing, I saved my classifier as a pickle file:
import pickle
with open(r'clf.pkl','wb') as f:
But when I load it again and try to use the classifier, I get TfidfVectorizer - Vocabulary wasn't fitted error. So I tried using pipeline and saved my vectorizer as well :
from sklearn.pipeline import Pipeline
with open(r'clf_1.pkl','wb') as f:
But still I get the same error. What might be going wrong?
EDIT: The pickle file was stored successfully and on the other end, I loaded the file:
import pickle
with open(r'clf_1.pkl','rb') as f:
And created a test data frame. When I do['resume']) it works fine but pred=clf.predict(test_tf) gives error TypeError: 'TfidfVectorizer' object is not iterable
Do I need to loop through the data frame that has around 15 objects?
