How to loop over a folder and delete stop words from every file - Python

I am currently working on removing stop words. This code runs, but I would like to ask how to turn it into a loop, that is, how to loop over and strip stop words from every file in a folder instead of a single file. I suspect the change is around the "file1 = open(...)" statement, but I don't know how to make it. The code is attached below. Thanks!
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
file1 = open(
    r"D:\1.1 SEC EDGAR年报源文件 (10Q_10KA_10QA)\2001\QTR1\20010102_10-K-A_edgar_data_1024302_0001092388-00-500453.txt")
line = file1.read()
file1.close()
words = word_tokenize(line)
# blank out stop words, then re-join the remaining tokens
words_without_stop_words = ["" if word in stop_words else word for word in words]
new_words = " ".join(words_without_stop_words).strip()
appendFile = open(
    r"D:\1.1 SEC EDGAR年报源文件 (10Q_10KA_10QA)\2001\QTR1\20010102_10-K-A_edgar_data_1024302_0001092388-00-500453.txt", 'w')
appendFile.write(new_words)
appendFile.close()

A simple fix is to list all the files in the folder and run the same code on each of them.
The complete code is below:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os

path = r"D:\1.1 SEC EDGAR年报源文件 (10Q_10KA_10QA)\2001\QTR1"   # no trailing backslash: a raw string cannot end with one
files = os.listdir(path)
stop_words = set(stopwords.words('english'))

for i in files:
    full_path = os.path.join(path, i)
    with open(full_path) as file1:
        line = file1.read()
    words = word_tokenize(line)
    words_without_stop_words = ["" if word in stop_words else word for word in words]
    new_words = " ".join(words_without_stop_words).strip()
    # overwrite the original file with the filtered text
    with open(full_path, 'w') as appendFile:
        appendFile.write(new_words)
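A slightly tidier variant uses pathlib to walk the folder. This is only a sketch, assuming every report in the folder is a plain .txt file that reads cleanly with the default encoding, and it drops stop-word tokens outright instead of replacing them with empty strings:
from pathlib import Path
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
folder = Path(r"D:\1.1 SEC EDGAR年报源文件 (10Q_10KA_10QA)\2001\QTR1")

for txt_file in folder.glob("*.txt"):
    text = txt_file.read_text()
    words = word_tokenize(text)
    # drop stop words entirely rather than leaving empty placeholders
    kept = [w for w in words if w.lower() not in stop_words]
    txt_file.write_text(" ".join(kept))   # overwrite the original file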

Related

Python not reading my file; I'm searching for occurrences of words in a text file

I'm searching for occurrences of different words in a text file.
I'm not good at Python, but I did this on Google Colab.
import os
from google.colab import drive

drive.mount('/content/drive/', force_remount=True)
if not os.path.exists('/content/drive/My Drive/Miserables'):
    os.makedirs('/content/drive/My Drive/Miserables')
root_dir = '/content/drive/My Drive/Miserables/'
os.listdir('/content/drive/My Drive/Miserables')

with open("/content/drive/My Drive/Miserables/miserable.txt", 'r') as f:
    myString = f.readlines()
print(len(myString))

searchWords = ["example"]
for word in searchWords:
    print(f"Word '{word}' appeared {myString.count(word)} time/s.")
The thing is, Python doesn't actually count the words, and I get 0 in the results even though I know those words are in the text.
Can somebody help me please? Thank you.
I guess the problem is that you use f.readlines() to get the file content.
That function returns a list of every line in the file, e.g.
["foo foo faa", "faa foo faa"]
This means you're searching for the word among whole lines, not in the text itself.
Try f.read() instead.
f.readlines() gives you a list with one element per line. For example, the text:
I'm here
this is here too
Bye buddy
will give you the list:
[
    "I'm here",
    "this is here too",
    "Bye buddy"
]
To solve it, either use f.read() instead of f.readlines():
import os
from google.colab import drive

drive.mount('/content/drive/', force_remount=True)
if not os.path.exists('/content/drive/My Drive/Miserables'):
    os.makedirs('/content/drive/My Drive/Miserables')
root_dir = '/content/drive/My Drive/Miserables/'
os.listdir('/content/drive/My Drive/Miserables')

with open("/content/drive/My Drive/Miserables/miserable.txt", 'r') as f:
    myString = f.read()
print(len(myString))

searchWords = ["example"]
for word in searchWords:
    print(f"Word '{word}' appeared {myString.count(word)} time/s.")
Or loop over the lines:
import os
from google.colab import drive

drive.mount('/content/drive/', force_remount=True)
if not os.path.exists('/content/drive/My Drive/Miserables'):
    os.makedirs('/content/drive/My Drive/Miserables')
root_dir = '/content/drive/My Drive/Miserables/'
os.listdir('/content/drive/My Drive/Miserables')

with open("/content/drive/My Drive/Miserables/miserable.txt", 'r') as f:
    myString = f.readlines()
print(len(myString))

searchWords = "example"
count = 0
for i in myString:
    count += i.count(searchWords)
print(f"Word '{searchWords}' appeared {count} time/s.")

Searching the plant names in a text file using python

I want to find all the plant names in a text file that I have made using the code below. I do not have a list of plant names or a specific plant name in mind. Is there a way to find and display every plant name in the text file?
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize

ps = PorterStemmer()
f = open("5.txt")
text = f.read()
f.close()   # note the parentheses; f.close on its own does not close the file
new_text = word_tokenize(text)
with open("V Token.txt", "w") as f:
    for w in new_text:
        print(w, file=f)

How to access and open files in a folder automatically and check similarity with an input file in Python

I am making a desktop tool for plagiarism checking between documents. I use stop words, a TF-IDF vectorizer, etc., and cosine similarity to check the similarity between two documents:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

userinput1 = input("Enter file name:")
myfile1 = open(userinput1).read()
stop_words = set(stopwords.words("english"))
word1 = nltk.word_tokenize(myfile1)
filtration_sentence = []
for w in word1:
    word = word_tokenize(myfile1)
    filtered_sentence = [w for w in word if not w in stop_words]
print(filtered_sentence)

userinput2 = input("Enter file name:")
myfile2 = open(userinput2).read()
stop_words = set(stopwords.words("english"))
word2 = nltk.word_tokenize(myfile2)
filtration_sentence = []
for w in word2:
    word = word_tokenize(myfile2)
    filtered_sentence = [w for w in word if not w in stop_words]
print(filtered_sentence)

stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

'''remove punctuation, lowercase, stem'''
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

def cosine_sim(myfile1, myfile2):
    tfidf = vectorizer.fit_transform([myfile1, myfile2])
    return ((tfidf * tfidf.T).A)[0, 1]

print(cosine_sim(myfile1, myfile2))
But the problem is that I have to check the similarity of the user's input file against a number of files in a folder. I tried my best to access the folder and open the files automatically but did not succeed. Can anyone tell me how to access a folder containing files, open the files one by one, and compare each with the input file? I am using Python 3.4.4 and Windows 7.
As per my understanding, you need to get all the files present in a directory/folder:
import os

directory = 'path_to_the_directory'
fileList = os.listdir(directory)
for eachFile in fileList:
    # os.listdir returns bare names, so join them with the directory
    with open(os.path.join(directory, eachFile), 'rb') as _fp:
        fileData = _fp.read()
        print("FILE DATA (%s):\n\n%s\n\n" % (_fp.name, fileData))
This will iterate through all the files in the directory and print each file's contents.
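Putting the two together, here is a rough sketch that compares the user's input file with every file in a folder. It reuses the cosine_sim function defined in the question above; the folder path is a placeholder, and no f-strings are used since the asker is on Python 3.4:
import os

userinput = input("Enter file name:")
with open(userinput) as f:
    user_text = f.read()

folder = 'path_to_the_directory'   # placeholder: folder holding the documents to compare against
for each_file in os.listdir(folder):
    full_path = os.path.join(folder, each_file)
    with open(full_path) as fp:
        file_text = fp.read()
    # cosine_sim is the function defined in the question above
    print(each_file, cosine_sim(user_text, file_text))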

Python: how to cap a string built from files in a path

In Python 2, how do I limit the length of a string built from importing all the .txt files in a directory? For example, a word length of 6000.
import glob

raw_text = ""
path = "/workspace/simple/*.txt"
for filename in glob.glob(path):
    with open(filename, 'r') as f:
        for line in f:
            raw_text += line
words = raw_text.split()
print(words)
This code only reads in all the .txt files and prints them to the screen. How do I limit it to 6000 words and print only 6000 words?
import glob

raw_text = ""
path = "/workspace/simple/*.txt"
for filename in glob.glob(path):
    with open(filename, 'r') as f:
        for line in f:
            if len(raw_text.split()) < N:   # here you put your number, e.g. 6000
                raw_text += line
            else:
                break
words = raw_text.split()
print(words)
Assuming you want 6000 or fewer words from each file:
import glob, sys

path = sys.argv[1]
count = int(sys.argv[2]) if len(sys.argv) > 2 else 60
words = []
for file in glob.glob(path):
    with open(file) as f:
        words += f.read().split()[:count]
print(words)
python test.py "/workspace/simple/*.txt" 6000
You could also set up a dictionary mapping each file to its words:
import glob, sys

path = sys.argv[1]
count = int(sys.argv[2]) if len(sys.argv) > 2 else 60
fwords = {}
for file in glob.glob(path):
    with open(file) as f:
        fwords[file] = f.read().split()[:count]
print(fwords)
If you want only the files that contain exactly that many words:
for file in glob.glob(path):
    with open(file) as f:
        tmp = f.read().split()
        if len(tmp) == count:   # keep only files with exactly `count` words
            fwords[file] = tmp
That depends on your definition of a word. If it's simply text separated by white space, it's fairly easy: count the words as they go past, and stop when you have enough. For instance:
word_limit = 6000
word_count = 0
for line in f:
    word_count += len(line.split())
    if word_count > word_limit:
        break
    raw_text += line
If you want exactly 6000 words, you can modify the loop to grab just enough words from the last line to reach 6000 exactly (a sketch of that variant follows this answer).
If you want to make it a little more efficient, drop raw_text and build words within the loop, one line at a time, with
line_words = line.split()
words.extend(line_words)
In this case, you'll want to use len(line_words) for your check.
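For the exact-6000 variant mentioned above, a minimal sketch (assuming f is an already-open file object, as in the loop above) takes only as many words from each line as are still needed:
word_limit = 6000
words = []
for line in f:
    remaining = word_limit - len(words)
    if remaining <= 0:
        break
    # take only as many words from this line as are still needed
    words.extend(line.split()[:remaining])
print(len(words))   # at most 6000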
Try replacing your code with this:
raw_text = ""
word_limit = 6000
word_count = 0
for filename in glob.glob(path):
    with open(filename, 'r') as f:
        for line in f:
            word_count += len(line.split())   # count words, not characters
            if word_count > word_limit:
                break
            raw_text += line

ValueError when using sklearn to do TF-IDF based on NLP

Here is the code:
import nltk
import string
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer

path = '/opt/datacourse/data/parts'
token_dict = {}
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

for subdir, dirs, files in os.walk(path):
    for file in files:
        file_path = subdir + os.path.sep + file
        shakes = open(file_path, 'r')
        text = shakes.read()
        lowers = text.lower()
        no_punctuation = lowers.translate(None, string.punctuation)   # Python 2 style translate
        token_dict[file] = no_punctuation

tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())
After running, it turned out to be:
File "D:\Python27\lib\site-packages\sklearn\feature_extraction\text.py", line 751, in _count_vocab
raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop words
According to others' replies, I've checked text.py and confirmed that min_df = 1 in __init__.
Can anyone tell me what the problem is? Much appreciated.
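One thing worth checking, since the message says the vocabulary is empty, is whether token_dict was filled at all: if the path does not exist, os.walk yields nothing and fit_transform receives no documents. A quick, purely diagnostic sketch (not from the original post) to run before fit_transform:
import os

print(os.path.isdir(path))    # False means os.walk will yield nothing
print(len(token_dict))        # 0 means no files were read
if token_dict:
    # inspect the first stored document to confirm it still contains real words
    sample = next(iter(token_dict.values()))
    print(repr(sample[:200]))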
