Tfidf empty vocabulary; perhaps the documents only contain stop words - python

Currently I am working on a project and using Tf-idf to transform the X_train data, which contains text data. When I call count_vectorizer.fit_transform(X_train), I get this error:
Traceback (most recent call last):
File "train.py", line 100, in <module>
counts = count_vectorizer.fit_transform(X_train)
File "/home/vishalthadari/Documents/Seperation 1/API's/Confirmation API/python 3 /env/lib/python3.6/site-packages/sklearn/feature_extraction/text.py", line 869, in fit_transform
self.fixed_vocabulary_)
File "/home/vishalthadari/Documents/Seperation 1/API's/Confirmation API/python 3 /env/lib/python3.6/site-packages/sklearn/feature_extraction/text.py", line 811, in _count_vocab
raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop words
I read other Stack Overflow questions like this Link, but I cannot understand how to split the X_train data.
Here's my train.py file:
import os
import numpy
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

NEWLINE = '\n'

TRAVEL = 'Travel'
OTHER = 'Other'

SOURCES = [
    ('data/travel', TRAVEL),
    ('data/other', OTHER),
]

SKIP_FILES = {'cmds', '.DS_Store'}

SEED = 0  # for reproducibility


def read_files(path):
    # Reads all files in all directories mentioned in SOURCES
    for root, dir_names, file_names in os.walk(path):
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            if file_name not in SKIP_FILES:
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    past_header, lines = False, []
                    f = open(file_path, encoding="latin-1")
                    for line in f:
                        if past_header:
                            lines.append(line)
                        elif line == NEWLINE:
                            past_header = True
                    f.close()
                    content = NEWLINE.join(lines)
                    yield file_path, content


def build_data_frame(path, classification):
    # Returns a data frame of all the files read using read_files()
    data_frame = DataFrame({'text': [], 'class': []})
    for file_name, text in read_files(path):
        data_frame = data_frame.append(
            DataFrame({'text': [text], 'class': [classification]}, index=[file_name]))
    return data_frame


data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))
data = data.reindex(numpy.random.permutation(data.index))

# Training data
X_train = numpy.asarray(data['text'])

count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(X_train)
I followed all the suggested solutions but still couldn't solve the issue. Am I taking the wrong approach to transform the data? If the approach is right, why am I getting this error?
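A quick sanity check (a minimal sketch, assuming the data frame above was built from the data directories) is to confirm that X_train actually contains non-empty text before vectorizing:
print(len(X_train))                                    # number of documents
print(sum(1 for doc in X_train if str(doc).strip()))   # documents with any text
print(X_train[:3])                                     # peek at the first few documents
If every document comes back empty, the problem is in how the files are read rather than in the vectorizer itself.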
Thanks in Advance

Related

AttributeError: 'DataFrame' object has no attribute 'seek'

In the DataProcesser class, the raw_file_processing, dataset_csv, classes_csv, and idset_csv functions process the raw data file and output csv files that can be read by the read_archive function.
My code raised the error AttributeError: 'DataFrame' object has no attribute 'seek'.
import pandas as pd
import warnings
import numpy as np
import os
import zipfile
import re
from sklearn.model_selection import train_test_split


class DataProcesser:
    def __init__(self, archive_path, col_id='ID', col_class='class', col_classname='class_name', col_set='set',
                 read_on_init=True, **kwargs):
        self.archive_path = archive_path
        self.archive = zipfile.ZipFile(self.archive_path, 'r')
        self.col_id = col_id
        self.col_class = col_class
        self.col_classname = col_classname
        self.col_set = col_set
        self.dataset = None
        self.dataset_cropped = None
        self.id_set = None
        self.classes = None
        self.train_set = None
        self.validation_set = None
        self.test_set = None
        self.logs = []
        self.stats = None
        self.flag_subset = False
        self.flag_process = False
        self.flag_split = False
        self.measurement_df = None
        if read_on_init:
            self.read_archive(**kwargs)

    def raw_file_processing(self):
        # If the path contains HTAN CODEX data, perform the following processing steps
        if os.path.isdir(archive_path):
            # 'Class' refers to the independent variable
            # The class info is the 3rd column tile_num in the current example
            # The rationale for looking at tile_num is that if we're examining tumor progression, we can observe the relative positions of the tumor growth
            # Tumor progression may be denoted by the corresponding values of tumor progression markers/antibodies such as CEA
            # In the future, we may append all the tumor patient files and normal patient files and then assign patient number as "class"
            self.col_classname = self.archive_path.iloc[2]
            # Dummy-code the classes
            self.col_class = pd.get_dummies(self.col_classname)
            # Create the ID series by concatenating columns 1-3
            self.col_id = self.archive_path.assign(
                ID=self.archive_path[['cell_id:cell_id', 'region:region', 'tile_num:tile_num']].apply(
                    lambda row: '_'.join([str(each) for each in row]), axis=1))
            self.col_id = self.archive_path.drop(columns=['cell_id:cell_id', 'region:region', 'tile_num:tile_num'])
            # Obtain measurement info
            # Normalize data against blank/empty columns
            # log-transform the data
            for col in self.archive_path[9:]:
                if re.findall(r"Blank|Empty", col):
                    background = col
                else:
                    for index, row in col:
                        norm_data = row / background
                        self.measurement_df = np.log2(norm_data)
        return self.archive_path, self.col_id, self.col_class, self.measurement_df

    def dataset_csv(self):
        # If the path contains HTAN CODEX data, perform the following processing steps
        if os.path.isdir(self.archive_path):
            """Col 1: ID
            Col 2: class
            Col 3-n: measurements"""
            id_col = self.col_id
            self.col_class = self.col_class.to_frame()
            frames = [id_col, self.col_class, self.measurement_df]
            self.dataset = pd.concat(frames)
            data_csv = self.dataset.to_csv("../input_data/dataset.csv")
        return data_csv

    def classes_csv(self):
        # If the path contains HTAN CODEX data, perform the following processing steps
        if os.path.isdir(self.archive_path):
            # Remove any duplicate rows with the same col_class and cls_col info
            self.cls_df = pd.DataFrame({'class': [self.col_class], 'class_name': [self.col_classname]})
            self.cls_df.drop_duplicate(keep=False, inplace=True)
            # Save as csv file
            self.cls_df.to_csv('../input_data/classes.csv')
        return self.cls_df

    def idset_csv(self):
        # If the path contains HTAN CODEX data, perform the following processing steps
        if os.path.isdir(self.archive_path):
            # Get the ids
            ids = self.archive_path[0]
            # Train-test-validation split
            ids.sample(frac=1)
            train, test = train_test_split(ids, test_size=0.2, random_state=1)
            train, val = train_test_split(train, test_size=0.25, random_state=1)
            # Assuming train, val, test are dataframes
            # A string is assigned to the "set" column.
            train.loc[:, 'set'] = 'train'
            val.loc[:, 'set'] = 'val'
            test.loc[:, 'set'] = 'test'
            # Save as csv file
            id_set = pd.concat([train, val, test], axis=0)
            id_set_csv = id_set.to_csv('../input_data/id_set.csv', index=False)
        return id_set_csv

    def zip_files(self):
        # Create a ZipFile object for dataset.csv, classes.csv, and id_set.csv
        zip = ZipFile("data.zip", "w")
        zip.write("dataset.csv")
        zip.write("classes.csv")
        zip.write("id_set.csv")
        zip.close()
        return zip

    def read_archive(self, datatable=True, **kwargs):
        """
        Read a zip archive, without extraction, that contains:
        * data as .csv, observations in rows, measurements in columns. Names of columns must have the format:
          A_1, A_2, A_3,..., C_1, C_2,... where A and C are groups (sensors) and 1,2,3... measurement time
        * IDs of training/validation/test as .csv
        * Explicit name of classes as .csv
        :return: 2 pandas, one with raw data, one with IDs
        """
        if datatable:
            try:
                from datatable import fread
                self.dataset = fread(self.archive.open('dataset.csv'), **kwargs).to_pandas()
                self.id_set = fread(self.archive.open('id_set.csv'), **kwargs).to_pandas()
                self.classes = fread(self.archive.open('classes.csv'), **kwargs).to_pandas()
            except ModuleNotFoundError:
                warnings.warn('datatable module not found, using pandas instead. To prevent this message from appearing'
                              ' use "datatable = False" when reading the archive.')
                self.dataset = pd.read_csv(self.archive.open('dataset.csv'))
                self.id_set = pd.read_csv(self.archive.open('id_set.csv'))
                self.classes = pd.read_csv(self.archive.open('classes.csv'))
        else:
            self.dataset = pd.read_csv(self.archive.open('dataset.csv'))
            self.id_set = pd.read_csv(self.archive.open('id_set.csv'))
            self.classes = pd.read_csv(self.archive.open('classes.csv'))
        self.check_datasets()
        self.logs.append('Read archive: {0}'.format(self.archive_path))
        return None


input_path = "//wsl$/Ubuntu-20.04/home/melissachua/CODEX/input_data"
# Open all the subfolders within path
for root, dirs, files in os.walk(input_path):
    for file in files:
        with open(os.path.join(root, file), "r") as data:
            data_file = pd.read_csv(data)
            data = DataProcesser(data_file, datatable=False)

meas_var = None
start_time = None
end_time = None

# Open all the subfolders within path
for root, dirs, files in os.walk(input_path):
    for file in files:
        with open(os.path.join(root, file), "r") as data:
            data_file = pd.read_csv(data)
            # The data object is used to automatically derive some parameters (e.g. number of classes)
            data = DataProcesser(data_file, datatable=False)
Traceback
Traceback (most recent call last):
  File "C:/Users/User/PycharmProjects/CODEX/main.py", line 171, in <module>
    data = DataProcesser(data_file, datatable=False)
  File "C:/Users/User/PycharmProjects/CODEX/main.py", line 16, in __init__
    self.archive = zipfile.ZipFile(self.archive_path, 'r')
  File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\zipfile.py", line 1269, in __init__
    self._RealGetContents()
  File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\zipfile.py", line 1332, in _RealGetContents
    endrec = _EndRecData(fp)
  File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\zipfile.py", line 264, in _EndRecData
    fpin.seek(0, 2)
  File "C:\Users\User\PycharmProjects\CODEX\venv\lib\site-packages\pandas\core\generic.py", line 5487, in __getattr__
    return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'seek'

Process finished with exit code 1
The error you are getting is from the zipfile.ZipFile call. You should pass (the path to) a .zip file to your constructor, not a pandas DataFrame.
In your code you have the following lines:
data_file = pd.read_csv(data)
data = DataProcesser(data_file, datatable=False)
With the first line you are reading a csv file into a DataFrame and storing this DataFrame in a variable data_file.
The second line uses this DataFrame as input for your DataProcesser constructor. The constructor, however, is defined as follows:
def __init__(self, archive_path, col_id='ID', col_class='class', col_classname='class_name', col_set='set',
read_on_init=True, **kwargs):
You are passing your DataFrame as archive_path, which is not what your constructor is expecting. The constructor expects a str file name (a file object would also work), for example for the zipfile constructor or the os functions. See the zipfile documentation for an example.
Therefore, store your DataFrame in another variable and use the archive_path for the file path. Unfortunately, you have mixed up the DataFrame instance and the archive_path multiple times in your code. Here are a few examples.
# Constructor
self.archive = zipfile.ZipFile(self.archive_path, 'r')
...

# First method
if os.path.isdir(archive_path):
    ...
    self.col_classname = self.archive_path.iloc[2]
    ...
    for col in self.archive_path[9:]:
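A minimal sketch of the corrected call site (the .zip path below is hypothetical; the point is that the constructor receives a path string while the DataFrame stays in its own variable):
import pandas as pd

# Path to the actual .zip archive on disk (hypothetical example path)
archive_path = "//wsl$/Ubuntu-20.04/home/melissachua/CODEX/input_data/data.zip"

# Pass the path string, not a DataFrame, to the constructor
data = DataProcesser(archive_path, datatable=False)

# If the raw csv contents are also needed, keep them in a separate variable
data_file = pd.read_csv("measurements.csv")  # hypothetical csv file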

Ignoring specific json files where key [Behavior] is not present

I am working with a huge Cuckoo sandbox dataset that has several .json files. I have to create a CSV file with the API stats from the behavior section of the JSON files, but if a JSON file doesn't have that specific key, the code stops executing.
Here is my program:
import pandas as pd
# As of Pandas 1.01, json_normalize as pandas.io.json.json_normalize is deprecated and is now exposed in the top-level namespace.
from pandas.io.json import json_normalize
from pathlib import Path
import json
import os

bkey = []
infoList = []
signaturesList = []
fileOpsList = []
irmaList = []
suricataList = []
virustotalList = []
sysmonList = []
resubmitList = []
snortList = []
behaviorList = []
memoryList = []
debugList = []
#mispList=[]
targetList = []
networkList = []
metadataList = []
list2 = []
#print(pathList)

path_to_json = 'C:/Users/skdk/Desktop/Ransomware-API/Benign/'

for file_name in [file for file in os.listdir(path_to_json) if file.endswith('.json')]:
    with open(path_to_json + file_name, encoding='utf-8') as json_file:
        data = json.load(json_file)
        #print(data)
        behaviorList.append(str(data['behavior']))

# for path in path_to_json:
#     p = Path(path)
#     #print(p)
#     # read json50
#     with p.open('r', encoding='utf-8') as f:
#         data = json.loads(f.read())
#         #print(p)
#         behaviorList.append(str(data['behavior']))

apiStatsList = []
for behavior in behaviorList:
    for key, value in eval(behavior)['apistats'].items():
        fileName = str(pathList[behaviorList.index(behavior)][:pathList[behaviorList.index(behavior)].index('.json')]) + "/" + str(key)
        list2.append(fileName)
        apiStatsList.append(value)
        print(fileName)

dataset2 = {}
for key, value in apiStatsList[0].items():
    dataset2[key] = [value]

count = 1
for apiStat in apiStatsList[1:]:
    for key, value in apiStat.items():
        if key in dataset2:
            while len(dataset2[key]) != count:
                dataset2[key].append(0)
            dataset2[key].append(apiStat[key])
        else:
            tempList = [0] * count
            tempList.append(value)
            dataset2[key] = tempList
    count = count + 1

dataset2['Directory'] = list2

df2 = pd.DataFrame.from_dict(dataset2, orient='index')
df2 = df2.transpose()
df2 = df2.fillna(0)
df2 = df2.set_index('Directory')
#df2
df2.to_csv('Benign.csv')
I am getting the following error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-16-fc19a9a3c2d1> in <module>
34 data = json.load(json_file)
35 #print(data)
---> 36 behaviorList.append(str(data['behavior']))
37
38 # for path in path_to_json:
KeyError: 'behavior'
Any help is appreciated.
Put it inside a try/except block:
try:
    # your code
except KeyError:
    # your code in case the json doesn't have the behavior key;
    # it could skip to the next file, for example.
It catches the specified error. Since you've said you are only interested in files that have the behavior key, this should help.
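A minimal sketch of how that looks in the loop from the question (using the same path_to_json and behaviorList as above):
for file_name in [file for file in os.listdir(path_to_json) if file.endswith('.json')]:
    with open(path_to_json + file_name, encoding='utf-8') as json_file:
        data = json.load(json_file)
        try:
            behaviorList.append(str(data['behavior']))
        except KeyError:
            # This file has no 'behavior' section; skip it and move on
            continue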

Merging csv files using Python

I am starting to learn Python and I would like to merge csv files. I have found the following code:
from os import chdir
from glob import glob
import pandas as pdlib

# Produce a single CSV after combining all files
def produceOneCSV(list_of_files, file_out):
    # Consolidate all CSV files into one object
    result_obj = pdlib.concat([pdlib.read_csv(file) for file in list_of_files])
    # Convert the above object into a csv file and export
    result_obj.to_csv(file_out, index=False, encoding="utf-8")

# Move to the path that holds our CSV files
csv_file_path = 'c:/Users/user/Desktop/DUT1'
chdir(csv_file_path)

# List all CSV files in the working dir
file_pattern = ".csv"
list_of_files = [file for file in glob('*.{}'.format(file_pattern))]
print(list_of_files)

file_out = "ConsolidateOutput.csv"
produceOneCSV(list_of_files, file_out)
But I get this error when I try to run it:
Traceback (most recent call last):
File "C:\Users\user\Desktop\DUT1\test.py", line 26, in <module>
produceOneCSV(list_of_files, file_out)
File "C:\Users\user\Desktop\DUT1\test.py", line 12, in produceOneCSV
result_obj = pdlib.concat([pdlib.read_csv(file) for file in list_of_files])
File "C:\Python\Python385\lib\site-packages\pandas\core\reshape\concat.py", line 274, in concat
op = _Concatenator(
File "C:\Python\Python385\lib\site-packages\pandas\core\reshape\concat.py", line 331, in __init__
raise ValueError("No objects to concatenate")
ValueError: No objects to concatenate
I don't know why it doesn't work.
Furthermore, I would like to remove the headers from all the files except the first one.
I had a similar use-case for which I developed this code chunk. You can try it like this:
import pandas as pd
from glob import glob
import os

def joinCsvFiles(outFile, dirPath, filePattern="*.csv"):
    dfs = []
    globPattern = os.path.join(dirPath, filePattern)
    fileParts = glob(globPattern)
    for filePart in fileParts:
        df = pd.read_csv(filePart, index_col=False, header=0)
        dfs.append(df)
    print("[!]. Merging {} part files to create a consolidated file\n".format(len(dfs)))
    try:
        finalDf = pd.concat(dfs, sort=False)
        finalDf.to_csv(outFile, index=False)
        print("[>]. Consolidated csv file generated successfully at filepath: '{}'\n".format(outFile))
    except Exception as e:
        raise e

if __name__ == '__main__':
    joinCsvFiles("finalReport.csv", "c:/Users/user/Desktop/DUT1", "*.csv")

Splitting CSV file into multiple sheets in an Excel file based on row limit argument

Hi, I am trying to run a utility script I found on GitHub:
https://gist.github.com/Athmailer/4cdb424f03129248fbb7ebd03df581cd
Update 1:
I modified the logic a bit more so that rather than splitting the CSV into multiple CSVs again, I am creating a single Excel file with multiple sheets containing the splits. Below is my code:
import os
import csv
import openpyxl
import argparse

def find_csv_filenames(path_to_dir, suffix=".csv"):
    filenames = os.listdir(path_to_dir)
    return [filename for filename in filenames if filename.endswith(suffix)]

def is_binary(filename):
    """
    Return true if the given filename appears to be binary.
    File is considered to be binary if it contains a NULL byte.
    FIXME: This approach incorrectly reports UTF-16 as binary.
    """
    with open(filename, 'rb') as f:
        for block in f:
            if '\0' in block:
                return True
    return False

def split(filehandler, delimiter=',', row_limit=5000,
          output_name_template='.xlsx', output_path='.', keep_headers=True):
    class MyDialect(csv.excel):
        def __init__(self, delimiter=','):
            self.delimiter = delimiter
        lineterminator = '\n'

    my_dialect = MyDialect(delimiter=delimiter)
    reader = csv.reader(filehandler, my_dialect)
    index = 0
    current_piece = 1
    # Create a new Excel workbook
    # Create a new Excel sheet with name Split1
    current_out_path = os.path.join(
        output_path,
        output_name_template
    )
    wb = openpyxl.Workbook()
    ws = wb.create_sheet(index=index, title="Split" + str(current_piece))
    current_limit = row_limit
    if keep_headers:
        headers = reader.next()
        ws.append(headers)
    for i, row in enumerate(reader):
        if i + 1 > current_limit:
            current_piece += 1
            current_limit = row_limit * current_piece
            ws = wb.create_sheet(index=index, title="Split" + str(current_piece))
            if keep_headers:
                ws.append(headers)
        ws.append(row)
    wb.save(current_out_path)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Splits a CSV file into multiple pieces.',
                                     prefix_chars='-+')
    parser.add_argument('-l', '--row_limit', type=int, default=5000,
                        help='The number of rows you want in each output file. (default: 5000)')
    args = parser.parse_args()

    # Check if output path exists else create new output folder
    output_path = 'Output'
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    with open('Logger.log', 'a+') as logfile:
        logfile.write('Filename --- Number of Rows\n')
        logfile.write('#Unsplit\n')
        # Get list of all csv's in the current folder
        filenames = find_csv_filenames(os.getcwd())
        filenames.sort()
        rem_filenames = []
        for filename in filenames:
            if is_binary(filename):
                logfile.write('{} --- binary -- skipped\n'.format(filename))
                rem_filenames.append(filename)
            else:
                with open(filename, 'rb') as infile:
                    reader_file = csv.reader(infile, delimiter=";", lineterminator="\n")
                    value = len(list(reader_file))
                    logfile.write('{} --- {} \n'.format(filename, value))
        filenames = [item for item in filenames if item not in rem_filenames]
        filenames.sort()
        logfile.write('#Post Split\n')
        for filename in filenames:
            #try:
            with open(filename, 'rb') as infile:
                name = filename.split('.')[0]
                split(filehandler=infile, delimiter=';', row_limit=args.row_limit,
                      output_name_template=name + '.xlsx', output_path='Output')
I have a folder called 'CSV Files' which contains a lot of CSVs that need to be split. I am keeping this utility script in the same folder.
I get the following error when running the script:
Traceback (most recent call last):
File "csv_split.py", line 96, in <module>
split(filehandler=infile,delimiter=';',row_limit=args.row_limit,output_name_template= name + '.xlsx',output_path='Output')
File "csv_split.py", line 57, in split
ws.append(row)
File "/home/ramakrishna/.local/lib/python2.7/site-packages/openpyxl/worksheet/worksheet.py", line 790, in append
cell = Cell(self, row=row_idx, col_idx=col_idx, value=content)
File "/home/ramakrishna/.local/lib/python2.7/site-packages/openpyxl/cell/cell.py", line 114, in __init__
self.value = value
File "/home/ramakrishna/.local/lib/python2.7/site-packages/openpyxl/cell/cell.py", line 294, in value
self._bind_value(value)
File "/home/ramakrishna/.local/lib/python2.7/site-packages/openpyxl/cell/cell.py", line 191, in _bind_value
value = self.check_string(value)
File "/home/ramakrishna/.local/lib/python2.7/site-packages/openpyxl/cell/cell.py", line 156, in check_string
raise IllegalCharacterError
openpyxl.utils.exceptions.IllegalCharacterError
Can someone let me know whether I have to add another for loop to go over each cell in the row and append it to the sheet, or can this be done in a single go? Also, I seem to have made this logic rather clumsy; can it be optimized further?
Folder structure for your reference
You must pass just the name of the file as a command-line argument:
python splitter.py 'Sports & Outdoors 2017-08-26'
Also, I tried running the above script and no matter which CSV I run it on, it doesn't return the first line (which should normally be a header) although keep_headers = True. Setting keep_headers = False also prints out the header line, which is a bit counterintuitive.
This script is meant to read a single CSV. If you want to read every CSV in a directory, you want to make another script that will loop through all the files in that directory.
import splitter as sp
import os

files = [f for f in os.listdir('/your/directory') if f[-4:] == '.csv']

for file in files:
    with open(file, 'r') as f:
        sp.split(f)
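The IllegalCharacterError itself is raised because some cell values contain control characters that openpyxl refuses to write. One way to work around it (a sketch only; the regex below is my own assumption about which characters to strip, not part of the original script) is to clean each row before ws.append(row):
import re

# ASCII control codes except tab, newline, and carriage return (assumed set)
ILLEGAL_CHARS = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')

def clean_row(row):
    # Strip illegal control characters from every string cell before appending
    return [ILLEGAL_CHARS.sub('', cell) if isinstance(cell, str) else cell
            for cell in row]

Then, inside split(), call ws.append(clean_row(row)) instead of ws.append(row).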

Loading pickle error : TfidfVectorizer - Vocabulary wasn't fitted

I am making a document classifier and here is my code:
import os
import io
import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)
            inBody = False
            lines = []
            f = io.open(path, 'r', encoding='latin1')
            for line in f:
                if inBody:
                    lines.append(line)
                elif line == '\n':
                    inBody = True
            f.close()
            message = '\n'.join(lines)
            yield path, message

def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'resume': message, 'class': classification})
        index.append(filename)
    return DataFrame(rows, index=index)

data = DataFrame({'resume': [], 'class': []})
data = data.append(dataFrameFromDirectory(r'<path>', 'Yes'))
data = data.append(dataFrameFromDirectory(r'<path>', 'No'))
Then I split the data, and used Tfidf Vectorizer:
tf=TfidfVectorizer(min_df=1, stop_words='english')
data_traintf=tf.fit_transform(data_train)
mnb=MultinomialNB()
mnb.fit(data_traintf,class_train)
After training and testing, I saved my classifier as a pickle file:
import pickle
with open(r'clf.pkl','wb') as f:
pickle.dump(mnb,f)
But when I load it again and try to use the classifier, I get a TfidfVectorizer - Vocabulary wasn't fitted error. So I tried using a Pipeline and saved my vectorizer as well:
from sklearn.pipeline import Pipeline
classifier=Pipeline([('tfidf',tf),('multiNB',mnb)])
with open(r'clf_1.pkl','wb') as f:
pickle.dump(classifier,f)
But still I get the same error. What might be going wrong?
EDIT: The pickle file was stored successfully and on the other end, I loaded the file:
import pickle
with open(r'clf_1.pkl','rb') as f:
clf=pickle.load(f)
And created a test data frame. When I do test_tf=tf.fit(test['resume']) it works fine, but pred=clf.predict(test_tf) gives the error TypeError: 'TfidfVectorizer' object is not iterable.
Do I need to loop through the data frame that has around 15 objects?
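For what it's worth, a pickled Pipeline is normally called on the raw text itself rather than on the output of a separately fitted vectorizer, since the pipeline applies its own TfidfVectorizer step. A minimal sketch of that usage (assuming the test frame has a 'resume' column as above):
import pickle

with open(r'clf_1.pkl', 'rb') as f:
    clf = pickle.load(f)

# The pipeline runs its fitted TfidfVectorizer internally, so pass the raw documents
pred = clf.predict(test['resume'])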
