Python multiline regex from text to text MULTILINE - python

I want to extract info from .cdp file ( content downloader file, program to parsing, can be opened in notepad) file looks like:
....
...
<CD_PARSING_RB_9>0</CD_PARSING_RB_9>
<CD_PARSING_RB_F9_3>0</CD_PARSING_RB_F9_3>
<CD_PARSING_LB_1>http://www.prospect.chisites.net/opportunities/?pageno=1
http://www.prospect.chisites.net/opportunities/?pageno=2
http://www.prospect.chisites.net/opportunities/?pageno=3
http://www.prospect.chisites.net/opportunities/?pageno=4</CD_PARSING_LB_1>
<CD_PARSING_EDIT_26>0</CD_PARSING_EDIT_26>
<CD_PARSING_EDIT_27><a href=/jobs/</CD_PARSING_EDIT_27>
<CD_PARSING_EDIT_28>0</CD_PARSING_EDIT_28>
I want to extract the links using Python. I found a solution, but it only works partially (it just deletes the <CD_PARSING_LB_1> tags); it should delete everything except the links between those two tags. A solution using search might also work, but for some reason it didn't for me.
code:
import string
import codecs
import re
import glob
outfile = open('newout.txt', 'w+')
try:
for file in glob.glob("*.cdp"):
print(file)
infile = open(file, 'r')
step1 = re.sub('.*<CD_PARSING_LB_1>', '',infile.read(), re.DOTALL)
step2 = re.sub('</CD_PARSING_LB_1>.*','', step1, re.DOTALL)
outfile.write(str(step1))
except Exception as ex:
print ex
raw_input()
Please help me in any way to get those links separated... Thanks
full file example:
Content Downloader X1 (11.9940) project file (parsing)
<F68_CB_5>0</F68_CB_5>
<F68_CB_8>0</F68_CB_8>
<F34_CB_4>0</F34_CB_4>
<F70_CB_4>0</F70_CB_4>
<F34_CB_5>0</F34_CB_5>
<F34_SE_1>0</F34_SE_1>
<F82_SE_2>0</F82_SE_2>
<F69_SE_1>1</F69_SE_1>
<F1_CMBO_8>0</F1_CMBO_8>
<F105_MEMO_1></F105_MEMO_1>
<F9_RBN_01>2</F9_RBN_01>
<F96_RB_01>1</F96_RB_01>
<F1_RBN_15>1</F1_RBN_15>
<F1_N120>1</F1_N120>
<F64_CB_01>0</F64_CB_01>
<F64_RB_01>1</F64_RB_01>
<F70_CB_03>0</F70_CB_03>
<CD_PARSING_COMBO_5>0</CD_PARSING_COMBO_5>
<F64_CB_02>0</F64_CB_02>
<F60_CB_02>0</F60_CB_02>
<F64_RE_1></F64_RE_1>
<F95_M_1></F95_M_1>
<F1_COMBO_6>0</F1_COMBO_6>
<F40_CHCKBX_555>0</F40_CHCKBX_555>
<F09_CB_01>0</F09_CB_01>
<F48_CB_02>0</F48_CB_02>
<F68_CB_01>0</F68_CB_01>
<F68_CB_02>0</F68_CB_02>
<F68_CB_03>0</F68_CB_03>
<F57_CB_41>0</F57_CB_41>
<F57_CB_43>0</F57_CB_43>
<F57_CB_45>0</F57_CB_45>
<F57_CB_47>0</F57_CB_47>
<F57_CB_49>0</F57_CB_49>
<F57_CB_51>0</F57_CB_51>
<F57_CB_53>0</F57_CB_53>
<F57_CB_55>0</F57_CB_55>
<F57_CB_57>0</F57_CB_57>
<F57_CB_59>0</F57_CB_59>
<F57_CB_61>0</F57_CB_61>
<F57_CB_63>0</F57_CB_63>
<F57_CB_65>0</F57_CB_65>
<F57_CB_67>0</F57_CB_67>
<F57_CB_69>0</F57_CB_69>
<F57_CB_71>0</F57_CB_71>
<F57_CB_73>0</F57_CB_73>
<F57_CB_75>0</F57_CB_75>
<F57_CB_77>0</F57_CB_77>
<F57_CB_79>0</F57_CB_79>
<F57_CB_42>0</F57_CB_42>
<F57_CB_44>0</F57_CB_44>
<F57_CB_46>0</F57_CB_46>
<F57_CB_48>0</F57_CB_48>
<F57_CB_50>0</F57_CB_50>
<F57_CB_52>0</F57_CB_52>
<CD_PARSING_EDIT_93>0</CD_PARSING_EDIT_93>
<CD_PARSING_EDIT_94></CD_PARSING_EDIT_94>
<CD_PARSING_EDIT_57_12></CD_PARSING_EDIT_57_12>
<CD_PARSING_EDIT_57_13></CD_PARSING_EDIT_57_13>
<CD_PARSING_EDIT_57_14></CD_PARSING_EDIT_57_14>
<CD_PARSING_EDIT_57_15></CD_PARSING_EDIT_57_15>
<CD_PARSING_EDIT_57_16></CD_PARSING_EDIT_57_16>
<CD_PARSING_EDIT_57_17></CD_PARSING_EDIT_57_17>
<CD_PARSING_EDIT_57_18></CD_PARSING_EDIT_57_18>
<CD_PARSING_RICH_50_1>[VALUE]</CD_PARSING_RICH_50_1>
<CD_PARSING_EDIT_F9_13>3</CD_PARSING_EDIT_F9_13>
<CD_PARSING_EDIT_F9_18>http://sitename.com</CD_PARSING_EDIT_F9_18>
<CD_PARSING_EDIT_F24_2>1</CD_PARSING_EDIT_F24_2>
<CD_PARSING_EDIT_F48_1></CD_PARSING_EDIT_F48_1>
<CD_PARSING_EDIT_F48_2>10</CD_PARSING_EDIT_F48_2>
<CD_PARSING_EDIT_F48_5>0</CD_PARSING_EDIT_F48_5>
<CD_PARSING_EDIT_F48_3>0</CD_PARSING_EDIT_F48_3>
<CD_PARSING_EDIT_F56_1></CD_PARSING_EDIT_F56_1>
<CD_PARSING_EDIT_F56_2>-</CD_PARSING_EDIT_F56_2>
<CD_PARSING_EDIT_F34_1></CD_PARSING_EDIT_F34_1>
<CD_PARSING_EDIT_F34_3>http://</CD_PARSING_EDIT_F34_3>
<CD_PARSING_EDIT_F40_2>Mozilla/5.0 (Windows; U; Windows NT 6.1; ru; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13 sputnik 2.1.0.18 YB/4.3.0</CD_PARSING_EDIT_F40_2>
<CD_PARSING_EDIT_F46_1></CD_PARSING_EDIT_F46_1>
<CD_PARSING_M49_1> class="entry"
id="news-id-
id="article-text"
</CD_PARSING_M49_1>
<CD_PARSING_M48_1></CD_PARSING_M48_1>
<F90_M_1></F90_M_1>
<CD_PARSING_M48_3></CD_PARSING_M48_3>
<CD_PARSING_SYN_F46_1><CD_CYCLE_GRAN_ALL!></CD_PARSING_SYN_F46_1>
<CD_PARSING_RICH_F9_1></CD_PARSING_RICH_F9_1>
<CD_PARSING_RICH_F9_2></CD_PARSING_RICH_F9_2>
<CD_PARSING_R24_1>0</CD_PARSING_R24_1>
<F1_COMBOBOX_9>0</F1_COMBOBOX_9>
<F1_COMBOBOX_10>2</F1_COMBOBOX_10>
<CD_PARSING_RB_9>0</CD_PARSING_RB_9>
<CD_PARSING_RB_F9_3>0</CD_PARSING_RB_F9_3>
<CD_PARSING_LB_1>http://www.latestvacancies.com/wates/</CD_PARSING_LB_1>
<CD_PARSING_EDIT_26>0</CD_PARSING_EDIT_26>
<CD_PARSING_EDIT_27>Jobs/Advert/</CD_PARSING_EDIT_27>
<CD_PARSING_EDIT_28>0</CD_PARSING_EDIT_28>
<CD_PARSING_EDIT_29>?</CD_PARSING_EDIT_29>
<CD_PARSING_COMBOBOX_1>csv</CD_PARSING_COMBOBOX_1>
<CD_PARSING_RE61_1></CD_PARSING_RE61_1>
<CD_PARSING_CHECK_61_1>1</CD_PARSING_CHECK_61_1>
<CD_PARSING_RB60_1>1</CD_PARSING_RB60_1>
<CD_PARSING_SE60_1>1</CD_PARSING_SE60_1>

Use this regex pattern.
String Pattern= "(http:.*=\d{1,7})";
See demo here https://regex101.com/r/fI3eT4/1

outfile = open('newout.txt', 'w+')
try:
for file in glob.glob("*.cdp"):
print(file)
infile = open(file, 'r')
step1 = re.sub(re.compile('.*[<]CD_PARSING_LB_1[>]', re.DOTALL), '',infile.read())
step2 = re.sub(re.compile('[<]/CD_PARSING_LB_1[>].*', re.DOTALL),'', step1)
outfile.write(str(step2))
except Exception as ex:
print ex
raw_input()
try this.
the fourth argument of re.sub is count, not flags.
and I think using result = re.search('[<]tag1[>](.*)[<]/tag1[>]'), and getting the links via result.group(1), might be easier.

Try this.
use with statement to read/write files.
the file is a builtin class, use something like ifile.
the regex only need to match the pattern http:[^<]*.
code
import string
import codecs
import re
import glob

# One pre-compiled pattern: an http:// URL runs until the next '<'.
LINK_RE = re.compile(r'(http:[^<]+)')

# Scan every .cdp file line by line and write the first link found on
# each line into newout.txt, one link per line.
with open('newout.txt', 'w+') as outfile:
    try:
        for ifile in glob.glob("*.cdp"):
            print (ifile)
            with open(ifile, 'r') as infile:
                for line in infile:
                    matches = LINK_RE.findall(line)
                    if matches:
                        outfile.write("%s\n" % matches[0].strip())
    except Exception as ex:
        print (ex)

Related

Python: How i Can print only a particular string starting with a specific character?

I have a text file which contains list of urls which looks like as follows:
https://www.ebay.com/itm/Egyptian-Comfort-1800-Count-4-Piece-Bed-Sheet-Set-Deep-Pocket-Bed-Sheets/142436469971?epid=1760442729&hash=item2129e00cd3%3Ag%3A7gIAAOSw3YBdRVJd&_trkparms=%2526rpp_cid%253D601435485fceeb223c6f4511&var=442541824291
Here i only want to print epid=1760442729 while reading the text file.
I have tried:
result = []`
with open('deals.txt', 'r') as f:
for line in f:
if line.startswith('?epid='):
break
result.append(line)
print(result[0].split('epid='))
But i am not getting as expected result.
Any help or suggestions will be helpful for me.
Thanks in advance
import re

# Two sample eBay URLs, one per line.
s = """https://www.ebay.com/itm/Egyptian-Comfort-1800-Count-4-Piece-Bed-Sheet-Set-Deep-Pocket-Bed-Sheets/142436469971?epid=1760442729&hash=item2129e00cd3%3Ag%3A7gIAAOSw3YBdRVJd&_trkparms=%2526rpp_cid%253D601435485fceeb223c6f4511&var=442541824291
https://www.ebay.com/itm/Egyptian-Comfort-1800-Count-4-Piece-Bed-Sheet-Set-Deep-Pocket-Bed-Sheets/142436469971?epid=172442729&hash=item2129e00cd3%3Ag%3A7gIAAOSw3YBdRVJd&_trkparms=%2526rpp_cid%253D601435485fceeb223c6f4511&var=442541824291"""

# Pull out every epid value: the digits between 'epid=' and the next '&'.
epid_pattern = re.compile(r'epid=(\d+)&', re.MULTILINE)
for value in epid_pattern.findall(s):
    print(f'epid = {value}')
epid = 1760442729
epid = 172442729
I would do it with a substring if the same structure is always used
result = []
with open('deals.txt', 'r') as f:
for line in f:
a= line.find('epid=')
b= line.find('&hash=')
print(line[a:b])
result.append(line)
Use the library that was designed to parse URL.
Example:
from urllib.parse import urlparse, parse_qs

# Let the stdlib URL machinery split the query string instead of slicing by hand.
URL='https://www.ebay.com/itm/Egyptian-Comfort-1800-Count-4-Piece-Bed-Sheet-Set-Deep-Pocket-Bed-Sheets/142436469971?epid=1760442729&hash=item2129e00cd3%3Ag%3A7gIAAOSw3YBdRVJd&_trkparms=%2526rpp_cid%253D601435485fceeb223c6f4511&var=442541824291'
parts = urlparse(URL)
params = parse_qs(parts.query)  # dict of query parameter -> list of values
epid_data = params['epid'][0]   # first (only) epid value
print(f'epid = {epid_data}')

Check if a variable string exist in a text file

So guys, i'm tryng to make a password generator but i'm having this trouble:
First, the code i use for tests:
# Append (or create) an "id: password" line in Senhas.txt.
idTest= "TEST"
passwrd= str(random.randint(11, 99))  # two-digit demo password
# write mode for a fresh file, append mode if it already exists —
# either way the same line ends up in the file
mode = 'w' if not os.path.exists('Senhas.txt') else 'a'
with open('Senhas.txt', mode) as senhas:
    senhas.write(f'{idTest}: {passwrd}\n')
print(f'{idTest}: {passwrd}')
Well, what i'm expecting is something like this:
# NOTE(review): this is the asker's *sketch* of the desired behaviour, not
# working code:
#   - 'something' is never defined;
#   - opened.read() consumes the file, so the for loop sees no lines;
#   - str.replace returns a new string — the result is discarded here;
#   - the append afterwards would duplicate the entry instead of updating it.
else:
    with open('Senhas.txt', 'r+') as opened:
        opened.read()
        for lines in opened:
            if something == idTest:
                lines.replace(f'{something}', f'{idTest}')
            else:
                break
    txtFileA = open('Senhas.txt', 'a')
    txtFileA.write(f'{idTest}: {passwrd}\n')
    txtFileA.close()
    print(f'{idTest}: {passwrd}')
I've searched for it but all i've found are ways to separate it in 2 files (for my project it doesn't match) or with "static" strings, that doesn't match for me as well.
You can use the fileinput module to update the file in place.
import fileinput
with fileinput.input(files=('Senhas.txt'), inplace=True) as f:
for line in f:
if (line.startswith(idTest+':'):
print(f'{idTest}: {passwrd}')
else:
print(line)

How to update line with modified data in Jython?

I'm have a csv file which contains hundred thousands of rows and below are some sample lines..,
1,Ni,23,28-02-2015 12:22:33.2212-02
2,Fi,21,28-02-2015 12:22:34.3212-02
3,Us,33,30-03-2015 12:23:35-01
4,Uk,34,31-03-2015 12:24:36.332211-02
I need to get the last column of csv data which is in wrong datetime format. So I need to get default datetimeformat("YYYY-MM-DD hh:mm:ss[.nnn]") from last column of the data.
I have tried the following script to get lines from it and write into flow file.
# NiFi ExecuteScript (Jython) — needs the NiFi/Java runtime; 'session' and
# 'REL_SUCCESS' are injected by NiFi and do not exist under plain CPython.
import json
import java.io
from org.apache.commons.io import IOUtils
from java.nio.charset import StandardCharsets
from org.apache.nifi.processor.io import StreamCallback

class PyStreamCallback(StreamCallback):
    """Copy the flow file content, dropping its first line (the CSV header)."""
    def __init__(self):
        pass
    def process(self, inputStream, outputStream):
        # NOTE(review): readLines loads the whole flow file into memory —
        # risky for the "hundred thousands of rows" case in the question.
        text = IOUtils.readLines(inputStream, StandardCharsets.UTF_8)
        # skip the header; the timestamp column is passed through UNCHANGED
        # (this is the question's code — the reformatting is still missing)
        for line in text[1:]:
            outputStream.write(line + "\n")

flowFile = session.get()
if (flowFile != None):
    flowFile = session.write(flowFile,PyStreamCallback())
    # re-setting 'filename' to its current value is a no-op — placeholder
    flowFile = session.putAttribute(flowFile, "filename", flowFile.getAttribute('filename'))
    session.transfer(flowFile, REL_SUCCESS)
but I am not able to find a way to convert it like below output.
1,Ni,23,28-02-2015 12:22:33.221
2,Fi,21,29-02-2015 12:22:34.321
3,Us,33,30-03-2015 12:23:35
4,Uk,34,31-03-2015 12:24:36.332
I have checked solutions with my friend(google) and was still not able to find solution.
Can anyone guide me to convert those input data into my required output?
In this transformation the unnecessary data located at the end of each line, so it's really easy to manage transform task with regular expression.
^(.*:\d\d)((\.\d{1,3})(\d*))?(-\d\d)?
Check the regular expression and explanation here:
https://regex101.com/r/sAB4SA/2
As soon as you have a large file - better not to load it into the memory. The following code loads whole the file into the memory:
IOUtils.readLines(inputStream, StandardCharsets.UTF_8)
Better to iterate line by line.
So this code is for ExecuteScript nifi processor with python (Jython) language:
# NiFi ExecuteScript (Jython) — streams the flow file line by line through
# Java reader/writer objects instead of loading it all into memory.
import sys
import re
import traceback
from org.apache.commons.io import IOUtils
from org.apache.nifi.processor.io import StreamCallback
from org.python.core.util import StringUtil
from java.lang import Class
from java.io import BufferedReader
from java.io import InputStreamReader
from java.io import OutputStreamWriter

class TransformCallback(StreamCallback):
    """Trim each line's timestamp: keep at most 3 fraction digits, drop the tz offset."""
    def __init__(self):
        pass
    def process(self, inputStream, outputStream):
        try:
            writer = OutputStreamWriter(outputStream,"UTF-8")
            reader = BufferedReader(InputStreamReader(inputStream,"UTF-8"))
            line = reader.readLine()
            # group 1 = everything up to the seconds, group 3 = '.' plus up
            # to 3 fraction digits; surplus digits (group 4) and the '-NN'
            # offset (group 5) are simply not written back out
            p = re.compile('^(.*:\d\d)((\.\d{1,3})(\d*))?(-\d\d)?')
            while line!= None:
                # print line
                match = p.search(line)
                # group(3) is None for timestamps without fractions — write '' then
                writer.write( match.group(1) + (match.group(3) if match.group(3)!=None else '') )
                writer.write('\n')
                line = reader.readLine()
            writer.flush()
            writer.close()
            reader.close()
        except:
            # surface the Jython traceback in the NiFi bulletin/log
            traceback.print_exc(file=sys.stdout)
            raise

flowFile = session.get()
if flowFile != None:
    flowFile = session.write(flowFile, TransformCallback())
    # Finish by transferring the FlowFile to an output relationship
    session.transfer(flowFile, REL_SUCCESS)
And as soon as question is about nifi, here are alternatives that seems to be easier
the same code as above but in groovy for nifi ExecuteScript processor:
// NiFi ExecuteScript (Groovy): stream the flow file, skip the header line,
// and trim each timestamp with a regex — '$1$3' keeps the date/time plus at
// most three fraction digits, dropping surplus digits and the '-NN' offset.
def ff = session.get()
if(!ff)return
ff = session.write(ff, {rawIn, rawOut->
    // ## transform streams into reader and writer
    rawIn.withReader("UTF-8"){reader->
        rawOut.withWriter("UTF-8"){writer->
            reader.eachLine{line, lineNum->
                if(lineNum>1) { // # skip the first line
                    // ## let use regular expression to transform each line
                    writer << line.replaceAll( /^(.*:\d\d)((\.\d{1,3})(\d*))?(-\d\d)?/ , '$1$3' ) << '\n'
                }
            }
        }
    }
} as StreamCallback)
session.transfer(ff, REL_SUCCESS)
ReplaceText processor
And if regular expression is ok - the easiest way in nifi is a ReplaceText processor that could do regular expression replace line-by-line.
In this case you don't need to write any code, just build the regular expression and configure your processor correctly.
Just using pure jython. It is an example that can be adapted to OP's needs.
Define a datetime parser for this csv file
from datetime import datetime
def parse_datetime(dtstr):
    """Normalise a 'DD-MM-YYYY HH:MM:SS[.ffff]-TZ' string.

    The trailing '-NN' offset is dropped; fractional seconds, when present,
    are truncated to millisecond precision.
    """
    # everything before the final '-' (i.e. strip the timezone offset)
    trimmed = '-'.join(dtstr.split('-')[:-1])
    fmt_frac = '%d-%m-%Y %H:%M:%S.%f'
    fmt_plain = '%d-%m-%Y %H:%M:%S'
    try:
        parsed = datetime.strptime(trimmed, fmt_frac)
    except ValueError:
        # no fractional part at all — round-trip through the plain format
        return datetime.strptime(trimmed, fmt_plain).strftime(fmt_plain)
    # %f always renders 6 digits; chop the last three to keep milliseconds
    return parsed.strftime(fmt_frac)[:-3]
my test.csv includes data like this (2015 didn't have a 29 Feb, so I had to change the OP's example):
1,Ni,23,27-02-2015 12:22:33.2212-02
2,Fi,21,28-02-2015 12:22:34.3212-02
3,Us,33,30-03-2015 12:23:35-01
4,Uk,34,31-03-2015 12:24:36.332211-02
now the solution
# Stream test.csv line by line: the first three columns pass through
# untouched, the fourth (the timestamp) goes through parse_datetime().
with open('test.csv') as fi:
    for line in fi:
        line_split=line.split(',')
        # NOTE(review): ', '.join re-joins with a space after each comma,
        # so the output format differs slightly from the input.
        out_line = ', '.join(word if i<3 else parse_datetime(word) for i,word in enumerate(line_split))
        #print(out_line)
        #you can write this out_line to a file here.
printing out_line looks like this
1, Ni, 23, 27-02-2015 12:22:33.221
2, Fi, 21, 28-02-2015 12:22:34.321
3, Us, 33, 30-03-2015 12:23:35
4, Uk, 34, 31-03-2015 12:24:36.332
You can get them with regex :
(\d\d-\d\d-\d\d\d\d\ \d\d:\d\d:)(\d+(?:\.\d+)*)(-\d\d)$
Then just replace #2 with a rounded version of #2
See regex example at regexr.com
You could even do it "nicer" by getting every single value with a capturing group and then put them into a datetime.datetime object and print it from there, but imho that would be an overkill in maintainability and loose you too much performance.
Code had no possibility to test
import re

# group 1 = fixed 25-char prefix (offset used for simplicity),
# group 2 = seconds value with optional fraction, group 3 = '-NN' tz offset
pattern = r'^(.{25})(\d+(?:\.\d+)*)(-\d\d)$'


def _round_seconds(line):
    """Round the fractional seconds at the end of *line* to 3 decimal places."""
    match = re.search(pattern, line)
    if match is None:
        return line  # line doesn't carry a timestamp — pass through
    # BUG FIX: the original called round() on the *string* match.group(2)
    # (a TypeError) and then concatenated the float to strings; convert to
    # float first and format the result back into a string.
    secs = round(float(match.group(2)), 3)
    return match.group(1) + f'{secs:g}' + match.group(3)


# NOTE: 'text' and 'outputStream' come from the surrounding NiFi script.
for line in text[1:]:
    outputStream.write(_round_seconds(line) + "\n")

execfile issue with the filename argument in python

I have two python scripts. One match script, the other is regex script. I want to start with my matchParser.py script and then go to regexParser.py. I want that my regexParser knows the filename of the matchParser and continue to use. I hope I could explain it clearly. I tried a lot, but unfortunately without success.
My OutputError: TypeError: coercing to Unicode: need string or buffer, file found
matchParser.py
# Python 2 script (print statement, execfile): merge every Desktop CSV into
# one output file, keeping only the first file's header row.
import glob
# NOTE(review): '\U' inside a non-raw string is a SyntaxError on Python 3;
# use r"C:\Users\..." if this is ever ported.
intfiles = glob.glob("C:\Users\x\Desktop\x\*.csv")
header_saved = False
with open('outputdre121.csv','wb') as fout:
    for filename in intfiles:
        with open(filename) as fin:
            # first line of every file is its header
            header = next(fin)
            if not header_saved:
                fout.write(header)
                header_saved = True
            for line in fin:
                fout.write(line)
# NOTE(review): 'fout' stays bound to the *closed* file object here — which
# is exactly why regexParser.py's open(matchParser.fout) fails below.
print "start next script: regexParser.py"
execfile("regexParser.py")
regexParser.py
import re
import matchParser

# BUG FIX: matchParser.fout is a (closed) *file object*, not a path string;
# open() needs its .name attribute — this was the "coercing to Unicode:
# need string or buffer, file found" TypeError in the question.
with open(matchParser.fout.name) as infile:
    lines = [line.strip() for line in infile]

with open('outputregexdre13.csv', "w") as output_csv:
    for result in lines:
        # a lazily-matched digit run preceded by whitespace, ending in 1-9,
        # optionally followed by '!', then a literal space
        match = re.search(r'((\s\d+?[1-9])!?[ ])', result)
        if match:
            output_csv.write(match.group(0) + '\n')
thanks!
I have found the solution:
Very simple actually ..
I access the .name attribute of matchParser.fout
lines = [line.strip() for line in open(matchParser.fout.name)]

How to search and replace text in a file?

How do I search and replace text in a file using Python 3?
Here is my code:
import os
import sys
import fileinput

# Interactive search-and-replace on a single file.
print ("Text to search for:")
textToSearch = input( "> " )
print ("Text to replace it with:")
textToReplace = input( "> " )
print ("File to perform Search-Replace on:")
fileToSearch = input( "> " )
#fileToSearch = 'D:\dummy1.txt'

# BUG FIX: the original wrote into the same file it was still reading via
# 'r+', so a shorter replacement left stale bytes at the end of the file
# (the "junk characters" in the question). Read everything first, then
# rewrite the file in a single pass.
with open(fileToSearch, 'r') as fh:
    contents = fh.read()

if textToSearch in contents:
    print('Match Found')
else:
    print('Match Not Found!!')

with open(fileToSearch, 'w') as fh:
    fh.write(contents.replace(textToSearch, textToReplace))

input( '\n\n Press Enter to exit...' )
Input file:
hi this is abcd hi this is abcd
This is dummy text file.
This is how search and replace works abcd
When I search for 'ram' and replace it with 'abcd' in the above input file, it works like a charm. But when I do it vice-versa, i.e. replacing 'abcd' with 'ram', some junk characters are left at the end.
Replacing 'abcd' by 'ram'
hi this is ram hi this is ram
This is dummy text file.
This is how search and replace works rambcd
As pointed out by michaelb958, you cannot replace in place with data of a different length because this will put the rest of the sections out of place. I disagree with the other posters suggesting you read from one file and write to another. Instead, I would read the file into memory, fix the data up, and then write it out to the same file in a separate step.
# Load the whole file, substitute, then overwrite it in a separate step —
# no in-place editing, so shorter replacements can't leave stale bytes.
with open('file.txt', 'r') as fh:
    filedata = fh.read().replace('abcd', 'ram')

with open('file.txt', 'w') as fh:
    fh.write(filedata)
Unless you've got a massive file to work with which is too big to load into memory in one go, or you are concerned about potential data loss if the process is interrupted during the second step in which you write data to the file.
fileinput already supports inplace editing. It redirects stdout to the file in this case:
#!/usr/bin/env python3
import fileinput

# Edit *filename* in place: while inplace=True is active, print() is
# redirected into the file; the original is kept as filename + '.bak'.
with fileinput.FileInput(filename, inplace=True, backup='.bak') as src:
    for current_line in src:
        # end='' because current_line already carries its newline
        print(current_line.replace(text_to_search, replacement_text), end='')
As Jack Aidley had posted and J.F. Sebastian pointed out, this code will not work:
# NOTE: intentionally NON-working code, kept as the answer's counter-example:
#   - 'with file = open(...)' is a SyntaxError; it must be 'with open(...) as file:'
#   - str.replace returns a new string — the result on the middle line is discarded
# Read in the file
filedata = None
with file = open('file.txt', 'r') :
    filedata = file.read()
# Replace the target string
filedata.replace('ram', 'abcd')
# Write the file out again
with file = open('file.txt', 'w') :
    file.write(filedata)`
But this code WILL work (I've tested it):
# Read, transform, write — each handle is closed as soon as it is done with.
with open(filein, 'r') as f:
    filedata = f.read()

newdata = filedata.replace("old data", "new data")

# filein and fileout may be the same path: opening for write truncates first
with open(fileout, 'w') as f:
    f.write(newdata)
Using this method, filein and fileout can be the same file, because Python 3.3 will overwrite the file upon opening for write.
You can do the replacement like this
# Copy file1 to file2 line by line, swapping old_text for new_text.
with open('file1.txt', 'r') as f1, open('file2.txt', 'w') as f2:
    for row in f1:
        f2.write(row.replace('old_text', 'new_text'))
You can also use pathlib.
# BUG FIX: the answer says "pathlib", but imported the third-party pathlib2
# backport (Python 2 only); on Python 3 the standard library is enough.
from pathlib import Path

path = Path(file_to_search)
text = path.read_text()
text = text.replace(text_to_search, replacement_text)
path.write_text(text)  # overwrites the file with the edited content
(pip install python-util)
# Third-party helper (pip install python-util) — NOT the standard library.
from pyutil import filereplace
filereplace("somefile.txt","abcd","ram")
Will replace all occurences of "abcd" with "ram".
The function also supports regex by specifying regex=True
# regex=True makes the search argument a regular expression pattern.
from pyutil import filereplace
filereplace("somefile.txt","\\w+","ram",regex=True)
Disclaimer: I'm the author (https://github.com/MisterL2/python-util)
Open the file in read mode. Read the file in string format. Replace the text as intended. Close the file. Again open the file in write mode. Finally, write the replaced text to the same file.
try:
    # BUG FIX: the original opened the *literal string* "file_name" for
    # reading but the variable file_name for writing; use the variable in
    # both places so the same file is read and rewritten.
    with open(file_name, "r") as text_file:
        texts = text_file.read()
    texts = texts.replace("to_replace", "replace_string")
    with open(file_name, "w") as text_file:
        text_file.write(texts)
except FileNotFoundError:
    # (the original bound the exception to 'f' but never used it)
    print("Could not find the file you are trying to read.")
Late answer, but this is what I use to find and replace inside a text file:
# Slurp the file, substitute, then overwrite it.
with open("test.txt") as src:
    contents = src.read()
contents = contents.replace("THIS", "THAT")
with open("test.txt", "w") as dst:
    dst.write(contents)
DEMO
With a single with block, you can search and replace your text:
# Search-and-replace within a single r+ handle.
with open('file.txt', 'r+') as f:
    filedata = f.read()
    filedata = filedata.replace('abc', 'xyz')
    # BUG FIX: truncate() does not move the file pointer; after read() it
    # sits at EOF, so without seek(0) the new data would be written at the
    # old end-of-file offset, padded with NUL bytes.
    f.seek(0)
    f.truncate(0)
    f.write(filedata)
Your problem stems from reading from and writing to the same file. Rather than opening fileToSearch for writing, open an actual temporary file and then after you're done and have closed tempFile, use os.rename to move the new file over fileToSearch.
My variant, one word at a time on the entire file.
I read it into memory.
def replace_word(infile, old_word, new_word):
    """Replace every occurrence of *old_word* with *new_word* in file *infile*, in place.

    Exits the process with status 1 if *infile* is not a regular file.
    """
    if not os.path.isfile(infile):
        print("Error on replace_word, not a regular file: " + infile)
        sys.exit(1)
    # BUG FIX: the original leaked both file handles (open(...).read() and
    # an unclosed write handle); 'with' closes them and guarantees the
    # write is flushed before the function returns.
    with open(infile, 'r') as src:
        contents = src.read()
    with open(infile, 'w') as dst:
        dst.write(contents.replace(old_word, new_word))
Using re.subn it is possible to have more control on the substitution process, such as word splitted over two lines, case-(in)sensitive match. Further, it returns the amount of matches which can be used to avoid waste of resources if the string is not found.
import re

file = 'path/to/file'  # BUG FIX: the original line was incomplete (not valid Python)

# they can be also raw string and regex
textToSearch = r'Ha.*O'  # here an example with a regex
textToReplace = 'hallo'

# read and replace
with open(file, 'r') as fd:
    # BUG FIX: re.subn's 4th *positional* argument is count, not flags —
    # the original passed re.I as a count, silently limiting the number of
    # substitutions instead of making the match case-insensitive.
    text, counter = re.subn(textToSearch, textToReplace, fd.read(), flags=re.I)

# check if there is at least a match
if counter > 0:
    # edit the file only when something actually changed
    with open(file, 'w') as fd:
        fd.write(text)

# summary result
print(f'{counter} occurence of "{textToSearch}" were replaced with "{textToReplace}".')
Some regex:
add the re.I flag, short form of re.IGNORECASE, for a case-insensitive match
for multi-line replacement re.subn(r'\n*'.join(textToSearch), textToReplace, fd.read()), depending on the data also '\n{,1}'. Notice that for this case textToSearch must be a pure string, not a regex!
Besides the answers already mentioned, here is an explanation of why you have some random characters at the end:
You are opening the file in r+ mode, not w mode. The key difference is that w mode clears the contents of the file as soon as you open it, whereas r+ doesn't.
This means that if your file content is "123456789" and you write "www" to it, you get "www456789". It overwrites the characters with the new input, but leaves any remaining input untouched.
You can clear a section of the file contents by using truncate(<startPosition>), but you are probably best off saving the updated file content to a string first, then doing truncate(0) and writing it all at once.
Or you can use my library :D
I got the same issue. The problem is that when you load a .txt in a variable you use it like an array of string while it's an array of character.
# NOTE(review): this iterates the file's text one *character* at a time, so
# replace('this','that') on a single character can never match a multi-char
# pattern — the loop just copies each character into the list unchanged.
# The claim in the surrounding prose that this fixes the problem looks
# wrong; verify before reusing.
swapString = []
with open(filepath) as f:
    s = f.read()
    for each in s:
        swapString.append(str(each).replace('this','that'))
    # rebinds s to the list of characters (the file is never written back)
    s = swapString
print(s)
I tried this and used readlines instead of read
# Read all lines of dummy.txt, empty the list, then write the (now empty)
# list back — i.e. this clears the file's contents.
with open('dummy.txt', 'r') as fh:
    lines = fh.readlines()  # FIX: don't shadow the builtin 'list'
print(f'before removal {lines}')
# FIX: removing items one by one from a copy-iterated list is O(n^2);
# clearing the list has the identical end state.
lines.clear()
print(f'After removal {lines}')
with open('dummy.txt', 'w+') as f:
    for i in lines:
        f.write(i)
you can use sed or awk or grep in python (with some restrictions). Here is a very simple example. It changes banana to bananatoothpaste in the file. You can edit and use it. ( I tested it worked...note: if you are testing under windows you should install "sed" command and set the path first)
import os
import subprocess

file = "a.txt"
oldtext = "Banana"
newtext = " BananaToothpaste"
# SECURITY FIX: pass the arguments as a list instead of interpolating them
# into a shell command string (os.system) — the shell form breaks on quotes
# or spaces and allows command injection if the texts ever come from user
# input. (The texts are still interpolated into sed's s/// pattern, so '/'
# characters in them would still need escaping.)
subprocess.run(['sed', '-i', 's/{}/{}/g'.format(oldtext, newtext), file])
#print(f'sed -i "s/{oldtext}/{newtext}/g" {file}')
print('This command was applied: sed -i "s/{}/{}/g" {}'.format(oldtext, newtext, file))
if you want to see results on the file directly apply: "type" for windows/ "cat" for linux:
####FOR WINDOWS:
os.popen("type " + file).read()
####FOR LINUX:
os.popen("cat " + file).read()
I have done this:
#!/usr/bin/env python3
import fileinput
import os

# Interactive in-place replace: ask for a directory, a file in it, and the
# search/replacement strings.
Dir = input ("Source directory: ")
os.chdir(Dir)
Filelist = os.listdir()
print('File list: ',Filelist)
NomeFile = input ("Insert file name: ")
CarOr = input ("Text to search: ")
CarNew = input ("New text: ")
# inplace=True redirects print() into the file; the original is kept as .bak
with fileinput.FileInput(NomeFile, inplace=True, backup='.bak') as src:
    for line in src:
        # end='' because each line already carries its newline
        print(line.replace(CarOr, CarNew), end='')
# FIX: removed the redundant file.close() after the with-block (the context
# manager has already closed the FileInput object) and stopped shadowing
# the name 'file'.
I modified Jayram Singh's post slightly in order to replace every instance of a '!' character to a number which I wanted to increment with each instance. Thought it might be helpful to someone who wanted to modify a character that occurred more than once per line and wanted to iterate. Hope that helps someone. PS- I'm very new at coding so apologies if my post is inappropriate in any way, but this worked for me.
# Copy file1 to file2 character by character; every '!' becomes a running
# counter in square brackets ([1], [2], ...), all other characters are
# copied verbatim.
with open('file1.txt', 'r') as f1, open('file2.txt', 'w') as f2:
    counter = 1
    for line in f1:
        for ch in line:
            if ch == '!':
                f2.write(f'[{counter}]')
                counter += 1
            else:
                f2.write(ch)
def word_replace(filename, old, new):
    """Replace space-surrounded whole-word occurrences of *old* with *new* in *filename*, in place."""
    with open(filename, 'r+', encoding='utf-8') as fh:
        contents = fh.read()
        # count whitespace-delimited tokens equal to *old*; this caps how
        # many replacements str.replace performs below
        occurrences = sum(1 for token in contents.split() if token == old)
        # pad both words with one space on each side so only whole words
        # (surrounded by literal spaces) are matched
        padded_old = old.center(len(old) + 2)
        padded_new = new.center(len(new) + 2)
        updated = contents.replace(padded_old, padded_new, occurrences)
        # shrink to zero, rewind, and write the edited text back
        fh.truncate(0)
        fh.seek(0)
        fh.write(updated)
    print('All words have been replaced!!!')
I have worked this out as an exercise of a course: open file, find and replace string and write to a new file.
class Letter:
    """Generate one personalised letter per invited name (course exercise)."""
    def __init__(self):
        with open("./Input/Names/invited_names.txt", "r") as file:
            # read the list of names, one per line, stripping newlines
            list_names = [line.rstrip() for line in file]
        # NOTE(review): .docx files are zip archives — opening them in text
        # mode only works if these are really plain-text files that merely
        # carry a .docx extension.
        with open("./Input/Letters/starting_letter.docx", "r") as f:
            # read letter template
            file_source = f.read()
        for name in list_names:
            with open(f"./Output/ReadyToSend/LetterTo{name}.docx", "w") as f:
                # replace the [name] placeholder with the invitee's name
                replace_string = file_source.replace('[name]', name)
                # write to a new file
                f.write(replace_string)
# runs the whole generation on import / script start
brief = Letter()
Like so:
def find_and_replace(file, word, replacement):
    """Replace every occurrence of *word* with *replacement* in *file*, in place."""
    with open(file, 'r+') as f:
        text = f.read()
        # BUG FIX: after read() the pointer is at EOF, so the original
        # *appended* the edited text instead of replacing the content;
        # rewind first and truncate in case the result is shorter.
        f.seek(0)
        f.write(text.replace(word, replacement))
        f.truncate()
def findReplace(find, replace):
    """Recursively replace *find* with *replace* in every .py file under the parent directory."""
    import os
    # WARNING(review): this rewrites source files in place, with no backup,
    # across the whole tree rooted at the *parent* of the current working
    # directory — only run it on a tree you can restore.
    src = os.path.join(os.getcwd(), os.pardir)
    for path, dirs, files in os.walk(os.path.abspath(src)):
        for name in files:
            if name.endswith('.py'):
                filepath = os.path.join(path, name)
                with open(filepath) as f:
                    s = f.read()
                s = s.replace(find, replace)
                with open(filepath, "w") as f:
                    f.write(s)

Categories