for root, dirs, files in os.walk(path):
    for file in files:
        print(os.path.join(d, file))
    for i in xrange(0, len(files)):
        for files[i] in files:
            corpus = open(os.path.join(d, files[i]), 'rb')
            corpus = corpus.read()
            # corpus = [line.lstrip() for line in corpus.split("\n")]
            lne = []
            # print(lne)
            for line in corpus.split("\n"):
                line = re.sub(' +', ' ', line)
                line = line.upper()
                lne.append(line.lstrip())
I tried line2 = next(iter(line)), but it does not produce the result I want. Since I have split the text corpus on newlines, I would expect something like next(iter(line)) to work. What I want is to get the current line as the loop runs, but also the line that comes after it.
I start with just two files:
one.txt:
this + that
then now
and two.txt:
science poetry
pigs + cows
... both in the folder "C:\scratch\sample".
The main thing I'd like to mention is the availability of a relatively new way of processing the contents of files and folders in Python: the pathlib module, documented in the "File and Directory Access" chapter of the standard library reference. It usually makes life easier.
>>> from pathlib import Path
>>> for file_name in Path('c:/scratch/sample').glob('*'):
...     with open(str(file_name)) as f:
...         result_line = []
...         for line in f.readlines():
...             result_line.append(line.strip().upper().replace(' +', ' '))
...         print(' '.join(result_line))
...
THIS THAT THEN NOW
SCIENCE POETRY PIGS COWS
I understood you to mean that you want to replace occurrences of ' +' with just one blank, and to turn entire lines into uppercase.
I also want to mention that: (a) it's best to avoid names like file that might (or might not) be built-in names in Python, because using them can make debugging difficult; (b) it's a good idea to use with when you open a file, because then the system arranges to close the file when you leave the scope of the with; and (c) the one nuisance I find with pathlib is that you must apply something like str to a result (in this case file_name) to turn it into a file name that open can use.
I hope this is useful information.
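Regarding point (c): on Python 3.6 and later, open() accepts a Path object directly (it implements os.PathLike), so the str() call is no longer needed. A minimal sketch of the same loop under that assumption:

>>> from pathlib import Path
>>> for file_name in Path('c:/scratch/sample').glob('*'):
...     with open(file_name) as f:   # no str() needed on Python 3.6+
...         print(f.read())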
Use an index to access the list.
for root, dirs, files in os.walk(path):
    for file in files:
        print(os.path.join(d, file))
    for i in xrange(0, len(files)):
        for files[i] in files:
            corpus = open(os.path.join(d, files[i]), 'rb')
            corpus = corpus.read()
            lne = []
            lines = corpus.split("\n")
            for i in xrange(0, len(lines) - 1):
                line = re.sub(' +', ' ', lines[i])
                line = line.upper()
                lne.append(line.lstrip())
                line2 = lines[i+1]
Here i is a value between 0 and number of lines - 2. Thus in the loop you can access:
line = lines[i]
line2 = lines[i + 1]
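If you prefer to avoid index arithmetic, an equivalent sketch (assuming lines, lne and re are as in the code above) pairs each line with the one that follows it using zip:

for line, line2 in zip(lines, lines[1:]):
    line = re.sub(' +', ' ', line).upper()
    lne.append(line.lstrip())
    # line2 is the following line, available in the same iteration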
I have several txt files that I would like to search, printing a single line if it starts with certain words (<meta property="og:description" content=). I currently have this code, which I want to use to search each file in the specified folder:
import glob

filepath = '/Volumes/hardDrive/Folder/files/*'
corpus = glob.glob(filepath)
for textfile in corpus:
    f = open(textfile, 'r')
    pTxt = []
    for ln in f:
        if ln.startswith(r'<meta property="og:description" content='):
            pTxt.append(ln[2:])
    print(pTxt)
Right now it's returning [] (without stopping) when it shouldn't be, and it also returns [] when I shorten the search text to "<meta" (which should return several more results). How can I fix this so that it only prints the target line from each file?
import glob

filepath = '/home/filepath/*.txt'
for textfile in glob.glob(filepath):
    pTxt = []
    f = open(textfile, 'r').readlines()
    for ln in f:
        if ln.startswith(r'<meta property="og:description" content='):
            pTxt.append(ln)
    print(pTxt)
This should work fine.
There are a lot of files in a directory. I need to find a pattern and replace it with a new string in every file. I am able to do so, but when there are multiple occurrences of the pattern in a file, the code only replaces the last match, not all the matches.
def find_and_replace(path):
    only_files = [f for f in listdir(path) if isfile(join(path, f))]
    os.chdir(path)
    pattern = '$$ENV$$'
    final_string = ''
    for i in iter(only_files):
        for n, line in enumerate(open(i)):
            if pattern in line:
                first_string = (line.split(os.path.splitext(os.path.basename(i))[0])[0])
                second_string = first_string.split('.')[0]
                final_string = second_string.rsplit(' ', 1)[1]
        print final_string
        for newline in fileinput.FileInput(i, inplace=1):
            newline = newline.replace(final_string, "Mayank")
Can you please help me here?
code only replaces the last match not all the matches.
Here is a skeleton of your code:
for fname in only_files:                                    # OUTER FOR-LOOP
    for line in open(fname):                                # FIRST FOR-LOOP
        ...
        final_string = second_string.rsplit(' ', 1)[1]
    for newline in fileinput.FileInput(fname, inplace=1):   # SECOND FOR-LOOP
        newline = newline.replace(final_string, "Mayank")
The outer for loop gets a file name.
The first for loop assigns values to final_string over and over again as it steps through every line in the file. After the for loop terminates, final_string contains a substring from the last line that contained the pattern.
The second for loop steps though every line in the file and does replacements for the value of final_string. The value of final_string never changes in the second for loop.
You need to collect all the matches in a list first, then use the second for loop to do the replacements, as in the sketch below.
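A minimal sketch of that idea, keeping the splitting logic from the question; the function name replace_all_matches and the list matches are illustrative names, not anything from the original code:

import fileinput
import os

def replace_all_matches(fname, pattern):
    # First pass: collect every string to replace.
    matches = []
    with open(fname) as f:
        for line in f:
            if pattern in line:
                first_string = line.split(os.path.splitext(os.path.basename(fname))[0])[0]
                second_string = first_string.split('.')[0]
                matches.append(second_string.rsplit(' ', 1)[1])
    # Second pass: replace every collected string, rewriting the file in place.
    for newline in fileinput.FileInput(fname, inplace=1):
        for match in matches:
            newline = newline.replace(match, "Mayank")
        print(newline.rstrip())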
I don't know who wrote the code in your question, but it is way too convoluted for anyone to follow, so if you want to post on public forums and get help from people, then you need to aspire to write code more like this:
def find_and_replace(path):
    fnames = [
        fname for fname in os.listdir(path)
        if os.path.isfile(os.path.join(path, fname))
    ]
    pattern = 'hello'
    os.chdir(path)

    for fname in fnames:
        with open(fname) as f:
            for line in f:
                if pattern in line:
                    name, ext = os.path.splitext(fname)
                    first_string = line.split(name)[0]
                    second_string = first_string.split('.')[0]
                    final_string = second_string.rsplit(' ', 1)[1]

        for newline in fi.FileInput(fname, inplace=1):
            newline = newline.replace(final_string, "Mayank")
            print(newline.rstrip())
In other words, use descriptive variable names (the name i is not appropriate for a filename) and do not try to do too much in one line of code. Your os.path.splitext line is a disaster.
Also, because it is not obvious what substring you are extracting from a matching line, it would have been helpful to anyone trying to answer your question to have some samples of your files.
The following code just replaces a string.
Textfiles is the folder I created for testing.
import os
import fileinput
from os import listdir
from os.path import isfile, join

def find_and_replace(path):
    print(path)
    onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
    os.chdir(path)
    pattern = 'Salma'
    final_string = 'haris'
    for i in onlyfiles:
        # Read in the file
        with open(i, 'r') as file:
            filedata = file.read()
        # Replace the target string
        filedata = filedata.replace(pattern, final_string)
        # Write the file out again
        with open(i, 'w') as file:
            file.write(filedata)

find_and_replace("textfiles")
I have 2 txt files (file_a.txt and file_b.txt).
file_a.txt contains a long list of 4-letter combinations (one combination per line):
aaaa
bcsg
aacd
gdee
aadw
hwer
etc.
file_b.txt contains a list of letter combinations of various length (some with spaces):
aaaibjkes
aaleoslk
abaaaalkjel
bcsgiweyoieotpwe
csseiolskj
gaelsi asdas
aaaloiersaaageehikjaaa
hwesdaaadf wiibhuehu
bcspwiopiejowih
gdeaes
aaailoiuwegoiglkjaaake
etc.
I am looking for a python script that would allow me to do the following:
read file_a.txt line by line
take each 4-letter combination (e.g. aaai)
read file_b.txt and find all the various-length letter combinations starting with the 4-letter combination (eg. aaaibjkes, aaailoiersaaageehikjaaa, aaailoiuwegoiglkjaaaike etc.)
print the results of each search in a separate txt file named with the 4-letter combination.
File aaai.txt:
aaaibjkes
aaailoiersaaageehikjaaa
aaailoiuwegoiglkjaaake
etc.
File bcsi.txt:
bcspwiopiejowih
bcsiweyoieotpwe
etc.
I'm sorry, I'm a newbie. Can someone point me in the right direction, please? So far I've only got:
#I presume I will have to use regex at some point
import re
file1 = open('file_a.txt', 'r').readlines()
file2 = open('file_b.txt', 'r').readlines()
#Should I look into findall()?
I hope this helps:
file1 = open('file_a.txt', 'r')
file2 = open('file_b.txt', 'r')

# get every item in your second file into a list
mylist = file2.readlines()

# read each line in the first file
for line1 in file1:
    searchStr = line1.strip()
    # find this combination in your second file
    exists = [s for s in mylist if searchStr in s]
    if searchStr and exists:
        # if it exists in your second file then create a file for it
        fileNew = open(searchStr + '.txt', 'w')
        for line in exists:
            fileNew.write(line)
        fileNew.close()

file1.close()
file2.close()
What you can do is open both files and work down them line by line using for loops.
You can have two for loops: the first reads file_a.txt, since you only need to go through it once; the second reads through file_b.txt and looks for the search string at the start of each line.
To do that you can use .find() to search for the string; since the match has to be at the start, the returned value should be 0.
file_a = open("file_a.txt", "r")
file_b = open("file_b.txt", "r")

for a_line in file_a:
    # This result value will be written into your new file
    result = ""
    # This is what we will search with
    search_val = a_line.strip("\n")
    print "---- Using " + search_val + " from file_a to search. ----"
    for b_line in file_b:
        print "Searching file_b using " + b_line.strip("\n")
        if b_line.strip("\n").find(search_val) == 0:
            result += b_line
    print "---- Search ended ----"
    # Set the read pointer to the start of the file again
    file_b.seek(0, 0)
    if result:
        # Write the contents of "result" into a file with the name of "search_val"
        with open(search_val + ".txt", "a") as f:
            f.write(result)

file_a.close()
file_b.close()
Test Cases:
I am using the test cases in your question:
file_a.txt
aaaa
bcsg
aacd
gdee
aadw
hwer
file_b.txt
aaaibjkes
aaleoslk
abaaaalkjel
bcsgiweyoieotpwe
csseiolskj
gaelsi asdas
aaaloiersaaageehikjaaa
hwesdaaadf wiibhuehu
bcspwiopiejowih
gdeaes
aaailoiuwegoiglkjaaake
The program produces an output file bcsg.txt as it is supposed to with bcsgiweyoieotpwe inside.
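As a side note, the .find(...) == 0 test in the code above can also be written with startswith; this is an equivalent alternative, not a change to the logic:

if b_line.strip("\n").startswith(search_val):
    result += b_line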
Try this:
f1 = open("a.txt", "r").readlines()
f2 = open("b.txt", "r").readlines()

file1 = [word.replace("\n", "") for word in f1]
file2 = [word.replace("\n", "") for word in f2]

data = []
data_dict = {}
for short_word in file1:
    data += [[short_word, w] for w in file2 if w.startswith(short_word)]

for single_data in data:
    if single_data[0] in data_dict:
        data_dict[single_data[0]].append(single_data[1])
    else:
        data_dict[single_data[0]] = [single_data[1]]

for key, val in data_dict.iteritems():
    open(key + ".txt", "w").writelines("\n".join(val))
    print(key + ".txt created")
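The grouping step above can also be written with collections.defaultdict, which removes the need for the membership test; a minimal sketch reusing the file1 and file2 lists from the code above:

from collections import defaultdict

data_dict = defaultdict(list)
for short_word in file1:
    for w in file2:
        if w.startswith(short_word):
            data_dict[short_word].append(w)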
So I want to open each file in a directory (there are 4 plain text documents in it) and do something like find specific words and the number of occurrences of each in every file.
This is the code I used, but I get a "no such file or directory" error, even though when I print the path it clearly shows each file's name.
import re
import os

path = 'C:\\Python27\\projects\\Alabama\\New folder'
pattern = re.compile(r"\bmay not\b", re.IGNORECASE)
pattern1 = re.compile(r"\bshall\b", re.IGNORECASE)
pattern2 = re.compile(r"\bmust\b", re.IGNORECASE)
pattern3 = re.compile(r"\bprohibited\b", re.IGNORECASE)
pattern4 = re.compile(r"\brequired\b", re.IGNORECASE)

for filenames in os.listdir(path):
    with open(filenames) as myfile:
        total = 0
        total1 = 0
        total2 = 0
        total3 = 0
        total4 = 0
        for line in myfile:
            m = re.findall(pattern, line)
            m1 = re.findall(pattern1, line)
            m2 = re.findall(pattern2, line)
            m3 = re.findall(pattern3, line)
            m4 = re.findall(pattern4, line)
            total += len(m)
            total1 += len(m1)
            total2 += len(m2)
            total3 += len(m3)
            total4 += len(m4)
        print total, total1, total2, total3, total4
My question is: how do I perform the task mentioned above, i.e. find the number of occurrences of specific words ("shall", "must", etc.) for each document in the directory separately?
listdir returns only the file names. You have to join the directory path with the file names in order to open them.
for filenames in os.listdir(path):
    with open(os.path.join(path, filenames)) as myfile:
As for counting the words, you have several options, depending on how exactly you wish to count and what you define an "occurrence" to be. For example, you can read the whole file as a string, then use the str.count method to count only the occurrences of specific words.
for filenames in os.listdir(path):
    with open(os.path.join(path, filenames)) as myfile:
        content = myfile.read().lower()  # to essentially ignore the case
        shall_count = content.count('shall')
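If whole-word matching matters (so that, for example, "musty" is not counted as "must"), the compiled patterns from the question can be reused; a minimal sketch:

for filenames in os.listdir(path):
    with open(os.path.join(path, filenames)) as myfile:
        content = myfile.read()
        shall_count = len(pattern1.findall(content))  # pattern1 is the \bshall\b pattern above
        must_count = len(pattern2.findall(content))   # pattern2 is the \bmust\b pattern above
        print filenames, shall_count, must_count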
I have code that goes through text files in a folder, looks for specific word matches, and counts them. For example, in file 1.txt the word 'one' is mentioned two times, so my output should be:
1.txt | 2
The line
print >> out, paper + "|" + str(hit_count)
does not return anything. Maybe str(hit_count) is not the right variable to print?
Any advice? Thanks.
for word in text:
    if re.match("(.*)(one|two)(.*)", word):
        hit_count = hit_count + 1
print >> out, paper + "|" + str(hit_count)
If I understand what you are trying to do, you don't really need a regex.
import glob

# glob.glob the directory to get a list of files - you didn't specify
for fname in file_list:
    with open(fname, 'r') as f:
        # if files are very long consider line by line
        # for line in f:
        file_content = f.read()
        count = file_content.count('one')
        print '{0} | {1}'.format(fname, count)
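Following the comment above about very long files, here is a line-by-line variant of the same count (file_list is still assumed to come from glob.glob):

for fname in file_list:
    count = 0
    with open(fname, 'r') as f:
        for line in f:
            count += line.count('one')
    print '{0} | {1}'.format(fname, count)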