Export entire regex group to text file - python

When I print the group with `print(a)`, every match is shown. When I save it to a text file with `open("sirs1.txt", "w").write(a)`, only the last row is saved to the file.
import re
def main():
# Scan sirs.txt line by line and print the first AA/BB/CC/DD match on each line.
f = open('sirs.txt')
for lines in f:
match = re.search('(AA|BB|CC|DD)......', lines)
if match:
a = match.group()
print(a)
# NOTE: mode "w" truncates sirs1.txt every time this statement runs, so only
# the most recently written match is left in the file.
open("sirs1.txt", "w").write(a)
How do I save the entire group to the text file?

nosklo is correct: the main problem is that you are overwriting the whole file each time you write to it. mehmattski is also correct that you need to explicitly add \n to each write to make the output file readable.
Try this:
import re


def main():
    """Copy the first AA/BB/CC/DD match of every line in sirs.txt to sirs1.txt.

    Each match (the two-letter prefix plus the six characters that follow it)
    is echoed to stdout and written to sirs1.txt on its own line.
    """
    pattern = re.compile('(AA|BB|CC|DD)......')
    # The output file is opened once, before the loop: reopening it in "w"
    # mode for every match would truncate it each time.
    # "with" closes both files even if reading or writing raises.
    with open('sirs.txt') as f, open('sirs1.txt', 'w') as outputfile:
        for lines in f:
            match = pattern.search(lines)
            if match:
                a = match.group()
                print(a)
                outputfile.write(a + "\n")

Opening a file in "w" mode creates (or truncates) it, so you are starting a fresh file on every iteration.
Open the output file once, outside the for loop:
import re


def main():
    """Write the first AA/BB/CC/DD match of each line in sirs.txt to sirs1.txt.

    Both files are opened once, outside the loop, so earlier matches are not
    overwritten. Every match is printed and written on its own line.
    """
    with open('sirs.txt') as f, open("sirs1.txt", "w") as fw:
        for lines in f:
            match = re.search('(AA|BB|CC|DD)......', lines)
            if match:
                a = match.group()
                print(a)
                # write() adds no line terminator of its own; without the
                # "\n" all matches would run together on a single line.
                fw.write(a + "\n")

You need to add a newline character after each string to get them to print on separate lines:
import re


def main():
    """Write the first AA/BB/CC/DD match of each line in sirs.txt to sirs1.txt.

    Every match is printed and then written followed by a newline, so the
    output file holds one match per line.
    """
    with open('sirs.txt') as f, open('sirs1.txt', 'w') as outputfile:
        for lines in f:
            match = re.search('(AA|BB|CC|DD)......', lines)
            if match:
                a = match.group()
                print(a)
                # The newline escape is a backslash followed by n; '/n' would
                # write a literal slash and the letter n instead.
                outputfile.write(a + '\n')

Related

Replace incorrect urls in text file and fix them in Python

I'm getting URLs with the forward slashes removed, and I need to correct the URLs inside a text file.
The URLs in the file look like this:
https:www.ebay.co.ukitmReds-Challenge-184-214-Holo-Shiny-Rare-Pokemon-Card-SM-Unbroken-Bonds-Rare124315281970?hash=item1cf1c4aa32%3Ag%3AXBAAAOSwJGRfSGI1&LH_BIN=1
I need to correct it to:
https://www.ebay.co.uk/itm/Reds-Challenge-184-214-Holo-Shiny-Rare-Pokemon-Card-SM-Unbroken-Bonds-Rare/124315281970?hash=item1cf1c4aa32%3Ag%3AXBAAAOSwJGRfSGI1&LH_BIN=1
So basically I need a regex, or some other way, to put those forward slashes back into each URL in the file and replace the broken URLs there.
import re
import time

# Repeats forever: every pass re-reads ebay2.csv, rewrites out.txt, prints the
# repaired lines, then waits one second before starting over.
while True:
    # Pass 1: copy ebay2.csv to out.txt, applying the three literal
    # replacements below to every line.
    with open("ebay2.csv", "rt") as fin, open("out.txt", "wt") as fout:
        for line in fin:
            fout.write(
                line.replace('https://www.ebay.co.uk/sch/', 'https://')
                .replace('itm', '/itm/')
                .replace('https:www.ebay', 'https://www.ebay')
            )
    # out.txt is closed (and therefore flushed) at this point. Reading it
    # while the writer was still open could miss data still in the buffer.

    # Pass 2: put a "/" in front of every run of 12 digits and print the line.
    regex = r"\d{12}"
    subst = "/\\g<0>"
    with open('out.txt') as f:
        for l in f:
            # count and flags are passed by keyword; passing them
            # positionally is deprecated in recent Python versions.
            result = re.sub(regex, subst, l, count=0, flags=re.MULTILINE)
            if result:
                print(result)
    time.sleep(1)
I eventually came up with this. It's a bit clumsy but it does the job fast enough.

Changing list to string to remove characters

I have a file that I am trying to do a word frequency list on, but I'm having trouble with the list and string aspects. I changed my file to a string to remove numbers from the file, but that ends up messing up the tokenization. The expected output is a word count of the file I am opening excluding numbers, but what I get is the following:
Counter({'<_io.TextIOWrapper': 1, "name='german/test/polarity/negative/neg_word_list.txt'": 1, "mode='r'": 1, "encoding='cp'>": 1})
done
Here's the code:
import re
from collections import Counter
def word_freq(file_tokens):
    """Return a Counter mapping each token in *file_tokens* to its frequency.

    The result is also stored in the module-level ``count`` so code that
    reads that global keeps working. An empty token list yields an empty
    Counter.
    """
    global count
    # Counter tallies the whole iterable in one pass; no explicit loop needed.
    count = Counter(file_tokens)
    return count
f = open("german/test/polarity/negative/neg_word_list.txt")
# NOTE: str(f) is the repr of the file object, not the file's text, so the
# tokens counted below come from that repr rather than from the word list.
clean = re.sub(r'[0-9]', '', str(f))
file_tokens = clean.split()
print(word_freq(file_tokens))
print("done")
f.close()
This ended up working, thank you to Rakesh:
import re
from collections import Counter
def word_freq(file_tokens):
    """Return a Counter mapping each token in *file_tokens* to its frequency.

    The result is also stored in the module-level ``count`` so code that
    reads that global keeps working. An empty token list yields an empty
    Counter.
    """
    global count
    # Counter tallies the whole iterable in one pass; no explicit loop needed.
    count = Counter(file_tokens)
    return count
# Read the whole word list, drop every digit, then count the remaining
# whitespace-separated tokens. "with" closes the file even if reading raises.
with open("german/test/polarity/negative/neg_word_list.txt") as f:
    clean = re.sub(r'[0-9]', '', f.read())
file_tokens = clean.split()
print(word_freq(file_tokens))
print("done")
Reading further, I noticed that you never actually read the file; you only opened it.
If you print the object returned by open():
f = open("german/test/polarity/negative/neg_word_list.txt")
print(f)
You'll notice it will tell you what the object is, "io.TextIOWrapper". So you need to read it:
f_path = open("german/test/polarity/negative/neg_word_list.txt")
f = f_path.read()
f_path.close() # don't forget to do this to clear stuff
print(f)
# >>> what's really inside the file
or another way to do this without the "close()":
# adjust your encoding
with open("german/test/polarity/negative/neg_word_list.txt", encoding="utf-8") as r:
f = r.read()
Doing that gives you one plain string rather than a list, so if you need a list you can iterate over the file line by line instead:
# adjust your encoding
with open("german/test/polarity/negative/neg_word_list.txt", encoding="utf-8") as r:
    # readlines() returns one list entry per line, each keeping its newline
    list_of_lines = r.readlines()

Reading and Stripping a text file

import re
f= ('HelloHowAreYou')
f = re.sub(r"([a-z\d])([A-Z])", r'\1 \2', f)
# Makes the string space separated. You can use split to convert it to list
f = f.split()
print (f)
This works fine for separating the string at capital letters; however, when I change the code to read a text file I have issues. Can anyone shed some light on why?
to read a file I'm using:
f = open('words.txt','r')
to read a file I'm using:
f = open('words.txt','r')
But that code doesn't read the file, it only opens it. Try:
# Open the file, read its entire contents into f, then release the handle.
my_file = open('words.txt','r')
f = my_file.read()  # read from the handle opened on the previous line
my_file.close()
Or
with open('words.txt','r') as my_file:
f = my_file.read()

Python Insert text before a specific line

I want to insert some text immediately before a specific line.
I want to insert 'Hello Everyone' before the first line starting with 'Number'.
My code:
import re
result = []
with open("text2.txt", "r+") as f:
a = [x.rstrip() for x in f] # stores all lines from f in a list, stripping trailing whitespace (including "\n")
# Find the first line that starts with "Number"; item holds that line after the break
for item in a:
if item.startswith("Number"): # same as your re check
break
ind = a.index(item) # index of that line within a
result.extend(a[:ind])
f.write('Hello Everyone')
Text file:
QWEW
RW
...
Number hey
Number ho
Expected output:
QWEW
RW
...
Hello Everyone
Number hey
Number ho
Please help me fix my code: nothing gets inserted into my text file.
Answers will be appreciated!
The problem
When you do open("text2.txt", "r"), you open your file for reading, not for writing. Therefore, nothing appears in your file.
The fix
Using r+ instead of r allows you to also write to the file (this was also pointed out in the comments). However, it overwrites, so be careful (this is an OS limitation, as described e.g. here). The following should do what you desire: it inserts "Hello everyone" into the list of lines and then overwrites the file with the updated lines.
# Insert "Hello everyone" before the first line that starts with "Number",
# then rewrite text2.txt in place with the updated lines.
with open("text2.txt", "r+") as f:
    # Strip only the line terminator so trailing spaces or tabs in the data
    # survive the rewrite.
    a = [x.rstrip("\n") for x in f]
    for index, item in enumerate(a):
        if item.startswith("Number"):
            a.insert(index, "Hello everyone")  # Inserts "Hello everyone" into `a`
            break  # stop right after inserting; only the first match counts
    # Go to start of file and clear it
    f.seek(0)
    f.truncate()
    # Write each line back
    f.writelines(line + "\n" for line in a)
The correct answer to your problem is the hlt one, but consider also using the fileinput module:
import fileinput

found = False
# With inplace=True, standard output is redirected into the file being read,
# so every print() below writes a line of the file's new contents.
with fileinput.input('DATA', inplace=True) as lines:
    for line in lines:
        if not found and line.startswith('Number'):
            print('Hello everyone')
            found = True
        # line still carries its own newline; end='' avoids doubling it.
        print(line, end='')
This is basically the same question as here: they propose to do it in three steps: read everything / insert / rewrite everything
# Step 1: read every line (newlines included) into memory.
with open("/tmp/text2.txt", "r") as f:
    lines = f.readlines()
# Step 2: insert the new line before the first line starting with "Number".
# The insert happens only when such a line exists, so a file without one
# (or an empty file) is written back unchanged.
for index, line in enumerate(lines):
    if line.startswith("Number"):
        lines.insert(index, "Hello everyone !\n")
        break
# Step 3: rewrite the whole file from the updated list.
with open("/tmp/text2.txt", "w") as f:
    f.writelines(lines)

Python regex findall into output file

I have an input file containing JavaScript code with many five-digit IDs. I want to collect these IDs in a list like:
53231,53891,72829 etc
This is my actual python file:
import re
fobj = open("input.txt", "r")
text = fobj.read()
output = re.findall(r'[0-9][0-9][0-9][0-9][0-9]' ,text)
outp = open("output.txt", "w")
How can I get these IDs into the output file in that format?
Thanks
import re

# Read the whole input; the context manager closes the file afterwards.
with open("input.txt", "r") as source:
    text = source.read()

# \b on both sides restricts matches to standalone five-digit numbers, so a
# longer run such as 123456 is not reported as 12345.
five_digits = re.compile(r'\b\d{5}\b')
output = five_digits.findall(text)

# Comma-separate the matches and write them out in one call.
out_str = ",".join(output)
with open("output.txt", "w") as sink:
    sink.write(out_str)

Categories