Excel file comparing with python and regex - python

Got two excel files, one with a little ammount of id's and other with tonns of id's + ip address:
How do i compare them line by line, and then print out a concatenation of id(first file) and id+ip(second file) cells that matched with id's?
Where could i go further from here?
import re
router_id = r'[0-9]{5}' # regex for finding 5-num-symboled id like '65432'
ip_multi = r'[0-9]+(?:\.[0-9]+){3}' # for finding ip address
def parsing_func(filename, method):
with open(filename, 'r') as file:
lines = str(file.readlines())
exp_data = re.findall(method, lines)
return exp_data
table_id = set(parsing_func('file1.txt', router_id))
table_addr = set(parsing_func('file2.txt', router_id))
table_mk = set(parsing_func('file2.txt', ip_multi))
content = '\n'.join(table_id)
print(content)

Related

How to replace two different lines of text?

I need to create a file that changes the date and name of a .txt, but I can only change one or the other with this code I found on the internet, can anyone give me any tips?
Print
import os
from ast import Str
file = open("example.txt", "r")
replacement = ""
data = "02/07/2022"
name = "Alan"
for line in file:
line = line.strip()
changes = line.replace("__/__/____", data)
replacement = replacement + changes + "\n"
file.close()
fout = open("final.txt", "w")
fout.write(replacement)
fout.close()
You don't need to do this a line a time. You can replace that entire program with this:
data = "02/07/2022"
name = "Alan"
text = open("example.txt", "r").read().replace("__/__/____", data)
open("final.txt", "w").write(text)

How do I get the file name and values I am getting form a file into a dataframe python

I am looking to read more than one file that starts with access-team then after I read the files I access them and get the information after username = and put it into a dataframe having a username and filename associated to
Here is the code I currently have but doesn't it doesn't dot it all yet. I am not sure how to read the two files and incorporate them into what I have below. my current results for one file are in one column. I need the dataframe to have the username and the file name.
file name: access-team-rev.txt file two is same way to but it is called access-team-support.txt
files look like this:
import re
filename = 'access-team-rev.txt'
pattern = re.compile(r'username\s=\s(\w+)')
l = []
with open(filename, "rt") as myfile:
for line in myfile:
if pattern.search(line) != None:
l.append(line.strip('username = '))
l.append(filename.strip('.txt'))
You should append as list
l.append( [ line.strip('username = '), filename.strip('.txt')] )
and after for-loop you can do
df = pandas.DataFrame(l)
and you will have it in DataFrame
Minimal working code
import pandas as pd
l = []
l.append( ['user1','file1'] )
l.append( ['user2','file2'] )
df = pd.DataFrame(l, columns=['username', 'filename'])
print(df)
Result
username filename
0 user1 file1
1 user2 file2
EDIT:
Read data
import re
filename = 'access-team-rev.txt'
pattern = re.compile(r'username\s=\s(\w+)')
l = []
with open(filename) as myfile:
for line in myfile:
if pattern.search(line):
l.append( [ line.strip('username = '), filename.strip('.txt')] )

all data variables in the same row CSV with Python

# coding=utf-8
# Libreria RegEx de Python.
import re
# Libreria para rutas.
import os
import csv
# function betwwen: return the value between two words a and b
def between(value, a, b):
pos_a = value.find(a) # Find and validate before-part.
if pos_a == -1: return "" # Find and validate after part.
pos_b = value.rfind(b)
if pos_b == -1: return "" # Return middle part.
adjusted_pos_a = pos_a + len(a)
if adjusted_pos_a >= pos_b: return ""
return value[adjusted_pos_a:pos_b]
# function scan folder DiarioOficial
def scan_folder():
# directory 'path'
path = '/Users/anna/PycharmProjects/extractData/DiarioOficial'
# contador de ficheros del path
count = 0
# creation csv as csvFile
with open('All_Companies1.csv', 'a') as csvFile:
# iterate all paths in the folder DiarioOficial without name
for (path, dirnames, file_names) in os.walk(path):
# iterate over all the files in the path (+ file_name)
for file_name in file_names:
# Add extension that is required
if file_name.endswith(".txt"):
# summatory count files in DiarioOficial folder
count = count + 1
# concatenation path + file name
file_path=os.path.join(path, file_name)
#print(file_path)
# open and read the file path
mensaje = open(file_path).read()
# Replace a newline for a space
mensaje = mensaje.replace("\n","")
# Company Name
keywords_cap = ['SpA', 'SPA', 'LIMITADA', 'LTDA', 'S.A.', 'E.I.R.L.', 'S.L.']
# re.escape to solve the problem with metacharacters in keyword_obj
keywords_cap = map(re.escape, keywords_cap)
# sorting the items by lengh in descending order
keywords_cap.sort(key=len, reverse=True)
obj = re.compile(r'[:,;.]\s*"?([^:,;.]*?(?<!\w)(?:{}))'.format('|'.join(keywords_cap)))
if obj:
# To obtain the first match obj.search(mensaje).group(1)
company_name = obj.search(mensaje)
else:
company_name = "None"
# CVE Number of the file
regex = r"\s*CVE\s+([^|]*)"
matches = re.search(regex, mensaje)
if matches:
company_cve = matches.group(1).strip()
else:
company_cve = "None"
# Section of diariooficial.interior.gob.cl
company_sect = between(mensaje, 'SECCIÓN', 'Núm.')
if company_sect:
company_sect = company_sect
else:
company_sect = "None"
# Name of the person that constitutes the company
company_ceo = re.search(r'\sante mí,\s+([^,]*)', mensaje)
if company_ceo:
company_ceo = company_ceo.group(1)
else:
company_ceo = "None"
# File Number from Section
num_reg = r'\sNúm.\s+([^|]*)'
match_num = re.search(num_reg, mensaje)
if match_num:
company_numsect = match_num.group(1)
else:
company_numsect = "None"
# Social Capital ($)
cap = r"\s*(CAPITAL:\s+([^-]*)|Capital social:\s+([^-]*)|Capital:\s+([^-]*)|Capital:\s+([^,]*))"
caps = re.search(cap, mensaje)
if caps:
company_capital = caps.group()
else:
company_capital = 'None'
csvData = [company_name, company_cve, company_sect, company_ceo, company_numsect, company_capital]
headers = ['COMPANY NAME', 'CVE', 'SECTION','CEO NAME','NUMBER SECTOR','COMPANY CAPITAL']
writer = csv.writer(csvFile, delimiter=',') # create a csv delimited by comma
writer.writerow(headers) # print the header row
writer.writerow(csvData) # print the Data in csv
# Number of txt files
print (count)
scan_folder()
I have this script that create a csv with the data extracted from a text in specific path. In spite of the errors that can be on RegEx, mainly it extracts parts of text that it keeps them in variables and the printa in a csv. Each company must have a single line in this csv. In this way, when the csv is opened, the number of companies and all the information can be visualized by variables.
My problem is that when I see the CSV called, in this case, All_companies1, the data is not put in the same row, they jump.
Also, the titles are repeated, and I do not want them to repeat themselves
First try changing the mode for the csvFile from a (append) to w (write), also check if the editor you're using actual uses the comma as the column delimiter for csv files, since in the above picture is seems as if the comma is seen by the editor as a normal character.
Also remove any carriage return characters (\n \r) from your string before printing it, this can be done in the following code.
csvData = [str(data).replace('\n', '').replace('\r', '') for data in csvData]
Note:
if by any chance this works, there might be a problem with with having empty rows in the csv file beteen each two elements, this can be fixed by changing with open('All_Companies1.csv', 'a') as csvFile to with open('All_Companies1.csv', 'a', newline='') as csvFile

Cleaning up a messy data file to a more readable format in Python?

I have a text file (heavily modified for this example) which has some data that I want to extract and do some calculations with it. However the text file is extremely messy, so I'm trying to clean it up and write it out to new files first.
Here is the .txt file I'm working with: http://textuploader.com/5elql
I am trying to extract the data which is under the titles (called “Important title”). The only possible way to do that is to first locate a string which always occurs in the file, and its called “DATASET” because all the mess above and below the important data will cover an arbitrary number of lines, difficult to remove manually. Once that’s done I want to store the data in separate files so that it is easier to analyse like this:
http://textuploader.com/5elqw
The file names will be concatenated with the title + the date.
Here is what I have tried so far
with open("example.txt") as file:
for line in file:
if line.startswith('DATASET:'):
fileTitle = line[9:]
if line.startswith("DATE:"):
fileDate = line[:]
print(fileTitle+fileDate)
OUTPUT
IMPORTANT TITLE 1
DATE: 12/30/2015
IMPORTANT TITLE 2
DATE: 01/03/2016
So it appears my loop manages to locate the lines where the titles inside the file are and print them out. But this is where I run out of steam. I have no idea on how to extract the data under those titles from there onwards. I have tried using file.readlines() but it outputs all the mess that is in between Important Title 1 and Important Title 2.
Any advice on how I can read all the data under the titles and output them into separate files? Thanks for your time.
You could use regex.
import re
pattern = r"(\s+X\s+Y\s*)|(\s*\d+\s+\d+\s*)"
prog = re.compile(pattern)
with open("example.txt") as file:
cur_filename = ''
content = ""
for line in file:
if line.startswith('DATASET:'):
fileTitle = line[9:]
elif line.startswith("DATE:"):
fileDate = line[6:]
cur_filename = (fileTitle.strip() + fileDate.strip()).replace('/', '-')
print(cur_filename)
content_title = fileTitle + line
elif prog.match(line):
content += line
elif cur_filename and content:
with open(cur_filename, 'w') as fp:
fp.write(content_title)
fp.write(content)
cur_filename = ''
content = ''
I don't know exactly how you want to store your data but assuming you want a dictionary you could use regex to check if the incoming line matched the pattern, then because fileTitle isn't global you could use that as the key and add the values. I also added rstrip('\r\n') to remove the newline characters after fileTitle.
import re
#if you don't want to store the X and Y, just use re.compile('\d\s+\d+')
p = re.compile('(\d\s+\d+)|(X\s+Y)')
data={}
with open("input.txt") as file:
for line in file:
if line.startswith('DATASET:'):
fileTitle = line[9:].rstrip('\r\n')
if line.startswith("DATE:"):
fileDate = line[:]
print(fileTitle+fileDate)
if p.match(line):
if fileTitle not in data:
data[fileTitle]=[]
line=line.rstrip('\r\n')
data[fileTitle].append(line.split('\t'))
if len(data[fileTitle][len(data[fileTitle])-1]) == 3:
data[fileTitle][len(data[fileTitle])-1].pop()
print data
Yet another regex solution:
sep = '*************************\n'
pattern = r'DATASET[^%]*'
good_stuff = re.compile(pattern)
pattern = r'^DATASET: (.*?)$'
title = re.compile(pattern, flags = re.MULTILINE)
pattern = r'^DATE: (.*?)$'
date = re.compile(pattern, flags = re.MULTILINE)
with open(r'foo.txt') as f:
data = f.read()
for match in good_stuff.finditer(data):
data = match.group()
important_title = title.search(data).group(1)
important_date = date.search(data).group(1)
important_date = important_date.replace(r'/', '-')
fname = important_title + important_date + '.txt'
print(sep, fname)
print(data)
##with open(fname, 'w') as f:
## f.write(data)

Copying string from a specific index from one file to pasting that string on a specific place in another file

My intention was to copy a piece of string after either a colon or equal sign from File 1 , and pasting that string in File 2 in a similar location after either a colon or equal sign.
For instance, if File 1 has:
username: Stack
File 2 is originally empty:
username=
I want Stack to be copied over to File 2 after username. Currently, I'm stuck and not sure what to do. The program piece I made below doesn't copy the username. I would greatly appreciate any input!
with open("C:/Users/SO//Downloads//f1.txt", "r") as f1:
with open("C:/Users/SO//Downloads//f2.txt", "r+") as f2:
searchlines = f1.readlines()
searchlines_f2=f2.readlines()
for i, line in enumerate(searchlines):
if 'username' in line:
for l in searchlines[i:i+1]:
ind = max(l.find(':'), l.find('='), 0) #finding index of specific characters
copy_string=l[ind+1:].strip() #copying string for file 2
for l in searchlines_f2[i:i+1]:
if 'username' in line:
f2.write(copy_string)
I think something like this will get you what you need in a more maintainable and Pythonic way.
Note the use of regex as well as some string methods (e.g., startswith)
import re
SOURCE_PATH = "C:/Users/SO//Downloads//f1.txt"
TARGET_PATH = "C:/Users/SO//Downloads//f2.txt"
def _get_lines(filepath):
""" read `filepath` and return a list of strings """
with open(filepath, "r+") as fh:
return fh.readlines()
def _get_value(fieldname, text):
""" parse `text` to get the value of `fieldname` """
try:
pattern = '%s[:=]{1}\s?(.*)' % fieldname
return re.match(pattern, text).group(1)
except IndexError:
# you may want to handle this differently!
return None
def _write_target(filepath, trgt_lines):
""" write `trgt_lines` to `filepath` """
with open(filepath, "w+") as fh:
fh.writelines(trgt_lines)
src_lines = _get_lines(SOURCE_PATH)
trgt_lines = _get_lines(TARGET_PATH)
# extract field values from source file
fields = ['username', 'id', 'location']
for field in fields:
value = None
for cur_src in src_lines:
if cur_src.startswith(field):
value = _get_value(field, cur_src)
break
# update target_file w/ value (if we were able to find it)
if value is not None:
for i, cur_trgt in enumerate(trgt_lines):
if cur_trgt.startswith('{0}='.format(field)):
trgt_lines[i] = '{0}={1}'.format(field, value)
break
_write_target(TARGET_PATH, trgt_lines)

Categories