String matching and replace in Python

I have a text file like this and I want to implement the substitution in Python:
Enter the Username"<Username>" and phonenumber"<phonenumber>"
Enter the origin"<origin>" and destination"<destination>"
Examples:
| Username | phonenumber | origin | destination|
| JOHN | 40256786 | NYC | LONDON |
I want to replace the strings that are in <> with the actual data, so that my output looks like this:
Enter the Username "JOHN" and phonenumber "40256786"
Enter the origin "NYC" and destination "LONDON"

Update
Try:
import re

text = []
data = []
with open('data.txt') as fp:
    for line in fp:
        if line.startswith('Examples'):
            break
        text.append(line)
    text = ''.join(text)
    headers = re.split(r'\s*\|\s*', fp.readline())[1:-1]
    for line in fp:
        values = re.split(r'\s*\|\s*', line)[1:-1]
        data.append(dict(zip(headers, values)))
for d in data:
    print(re.sub(r'<(?P<key>[^>]*)>', lambda x: d[x.group('key')], text))
Output:
Enter the Username"JOHN" and phonenumber"40256786"
Enter the origin"NYC" and destination"LONDON"
Old answer
You can use plenty of templating tools to substitute variables into text: string.Template ($), format strings ({ }), Jinja2 ({{ }}). If you can, change your delimiter.
Here is an example using format strings:
text = '''\
Enter the Username "{Username}" and phonenumber "{phonenumber}"
Enter the origin "{origin}" and destination "{destination}"\
'''
data = {'Username': 'John', 'phonenumber': '40256786',
        'origin': 'NYC', 'destination': 'LONDON'}
print(text.format(**data))
Output:
Enter the Username "John" and phonenumber "40256786"
Enter the origin "NYC" and destination "LONDON"

One way would be to split each line by the delimiting character |. Then you can set the variables for the string accordingly.
sample_line = '| JOHN | 40256786 | NYC | LONDON |'
sample_line = sample_line.split('|')
# strip() removes the padding spaces around each cell
data = {
    'Username': sample_line[1].strip(),
    'phonenumber': sample_line[2].strip(),
    'origin': sample_line[3].strip(),
    'destination': sample_line[4].strip()
}
text = '''\
Enter the Username "{Username}" and phonenumber "{phonenumber}"
Enter the origin "{origin}" and destination "{destination}"\
'''
print(text.format(**data))
Alternatively, you should be able to use something like csv.reader
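For example, a rough csv.reader sketch; the file name examples.txt is hypothetical, and it assumes the pipe-delimited rows are stored one per line:
import csv

with open('examples.txt', newline='') as fp:
    reader = csv.reader(fp, delimiter='|')
    # strip the padding spaces around every cell
    rows = [[cell.strip() for cell in row] for row in reader]

headers = rows[0][1:-1]      # drop the empty cells created by the outer pipes
for row in rows[1:]:
    print(dict(zip(headers, row[1:-1])))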

Related

How to retrieve information in the first section of the raw data only by regular expressions?

Below is a sample of the raw data that my code will process with regular expressions:
raw_data = '''
name : John
age : 26
gender : male
occupation : teacher
Father
---------------------
name : Bill
age : 52
gender : male
Mother
---------------------
name : Mary
age : 48
gender : female
'''
I want to retrieve the following part of information from the raw data and store it in a dictionary:
dict(name = 'John', age = 26, gender = 'male', occupation = 'teacher')
However, when I run my code as follows, it does not work as I expect:
import re

p = re.compile(r'[^-]*?^([^:\-]+?):([^\r\n]*?)$', re.M)
rets = p.findall(raw_data)
infoAboutJohnAsDict = {}
if rets != []:
    for ret in rets:
        infoAboutJohnAsDict[ret[0]] = ret[1]
else:
    print("Not match.")
print(f'rets = {rets}')
print(f'infoAboutJohnAsDict = {infoAboutJohnAsDict}')
Can anyone give me a suggestion about how I should modify my code to achieve what I intend to do?
Here is one approach using regular expressions. We can first trim off the latter portion of the input which you don't want using re.sub. Then, use re.findall to find all key value pairs for John, and convert to a dictionary.
raw_data = re.sub(r'\s+\w+\s+-+.*', '', raw_data, flags=re.S)
matches = re.findall(r'(\w+)\s*:\s*(\w+)', raw_data)

d = dict()
for m in matches:
    d[m[0]] = m[1]

print(d)
# {'gender': 'male', 'age': '26', 'name': 'John', 'occupation': 'teacher'}

Python regex: match patterns that start with a dot and store them in a dict

from pprint import pprint
data = '''
.
.
.
#Long log file
-------------------------------------------------------------------------------
Section Name | Budget | Size | Prev Size | Overflow
--------------------------------+-----------+-----------+-----------+----------
.text.resident | 712924 | 794576 | 832688 | YES
.rodata.resident | 77824 | 77560 | 21496 | YES
.data.resident | 28672 | 28660 | 42308 | NO
.bss.resident | 52672 | 1051632 | 1455728 | YES
.
.
.
'''
Output expected:
MEMDICT = {'.text.resident':   {'Budget': '712924', 'Size': '794576', 'Prev Size': '832688', 'Overflow': 'YES'},
           '.rodata.resident': {'Budget': '', 'Size': '', 'Prev Size': '', 'Overflow': 'YES'},
           '.data.resident':   {'Budget': '', 'Size': '', 'Prev Size': '', 'Overflow': 'NO'},
           '.bss.resident':    {'Budget': '', 'Size': '', 'Prev Size': '', 'Overflow': 'YES'}}
I am a beginner in Python. Please suggest some simple steps.
Logic:
Search for a regex pattern and get the headers in a list:
pattern = re.compile(r'\sSection Name\s|\sBudget*')  # This can be improved
if pattern.match(line):
    key_list = (''.join(line.split())).split('|')  # Unable to handle space issues, so trimmed and split
Search for a regex pattern to match .something.resident | \d+ | \d+ | \d+ | ** and get it into value_list; I need some help with this part.
Make all the lists into the dict in a loop:
mem_info = {}  # reset the dict
for i in range(0, len(key_list)):
    mem_info[key_list[i]] = value_list[i]
MEMDICT[sta_info[0]] = sta_info
The only thing you haven't shown us is what line ends the section. Other than that, this is what you need:
keeper = False
memdict = {}
for line in open(file):
    if not keeper:
        if 'Section Name' in line:
            keeper = True
        continue
    if '-------------------' in line:
        continue
    if 'whatever ends the section' in line:
        break
    # split on '|' so the pipes and multi-word cells don't become separate tokens
    parts = [p.strip() for p in line.split('|')]
    memdict[parts[0]] = {
        'Budget': int(parts[1]),
        'Size': int(parts[2]),
        'Prev Size': int(parts[3]),
        'Overflow': parts[4]
    }
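With the sample table above, memdict would come out roughly as {'.text.resident': {'Budget': 712924, 'Size': 794576, 'Prev Size': 832688, 'Overflow': 'YES'}, ...}; note that the numeric fields end up as int here, while the expected MEMDICT in the question keeps them as strings.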

How do I transform a non-CSV text file into a CSV using Python/Pandas?

I have a text file that looks like this:
Id Number: 12345678
Location: 1234561791234567090-8.9
Street: 999 Street AVE
Buyer: john doe
Id Number: 12345688
Location: 3582561791254567090-8.9
Street: 123 Street AVE
Buyer: Jane doe # buyer % LLC
Id Number: 12345689
Location: 8542561791254567090-8.9
Street: 854 Street AVE
Buyer: Jake and Bob: Owner%LLC: Inc
I'd like the file to look like this:
| Id Number | Location | Street | Buyer |
| 12345678 | 1234561791234567090-8.9 | 999 Street AVE | john doe |
| 12345688 | 3582561791254567090-8.9 | 123 Street AVE | Jane doe # buyer % LLC |
| 12345689 | 8542561791254567090-8.9 | 854 Street AVE | Jake and Bob: Owner%LLC: Inc |
I have tried the following:
# 1 Read text file and ignore bad lines (lines with extra colons thus reading as extra fields).
tr = pd.read_csv('C:\\File Path\\test.txt', sep=':', header=None, error_bad_lines=False)
# 2 Convert into a dataframe/pivot table.
ndf = pd.DataFrame(tr.pivot(index=None, columns=0, values=1))
# 3 Clean up the pivot table to remove NaNs and reset the index (line by line).
nf2 = ndf.apply(lambda x: x.dropna().reset_index(drop=True))
Here is where I got the last line (#3): https://stackoverflow.com/a/62481057/10448224
When I do the above and export to CSV the headers are arranged like the following:
(index) | Street | Buyer | Id Number | Location
The data is filled in nicely, but at some point the Buyer field becomes inaccurate, while the rest of the fields stay accurate through the entire DF.
My guesses:
When I run #1 part of my script I get the following errors 507 times:
b'Skipping line 500: expected 2 fields, saw 3\nSkipping line 728: expected 2 fields, saw 3\
At the tail end of the new DF I am missing exactly 507 entries for the Buyer field. So I think when I drop my bad lines, the field is pushing my data up.
Pain Points:
The Buyer field will sometimes have extra colons and other odd characters. So when I try to use a colon as a delimiter I run into problems.
I am new to Python and I am very new to using functions. I primarily use Pandas to manipulate data at a somewhat basic level. So in the words of the great Michael Scott: "Explain it to me like I'm five." Many many thanks to anyone willing to help.
Here's what I meant by reading in and using split. Very similar to other answers. Untested, and I don't recall if inputline includes the EOL, so I stripped it too.
import pandas as pd

with open('myfile.txt') as f:
    data = []      # holds database
    record = {}    # holds built-up record
    for inputline in f:
        key, value = inputline.strip().split(':', 1)
        if key == "Id Number":        # new record starting
            if len(record):
                data.append(record)   # write previous record
                record = {}
        record.update({key: value})
    if len(record):
        data.append(record)           # output final record
df = pd.DataFrame(data)
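From there, df.to_csv('output.csv', index=False) would write the CSV layout the question asks for (the file name output.csv is just illustrative).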
This is a minimal example that demonstrates the basics:
cat split_test.txt
Id Number: 12345678
Location: 1234561791234567090-8.9
Street: 999 Street AVE
Buyer: john doe
Id Number: 12345688
Location: 3582561791254567090-8.9
Street: 123 Street AVE
Buyer: Jane doe # buyer % LLC
Id Number: 12345689
Location: 8542561791254567090-8.9
Street: 854 Street AVE
Buyer: Jake and Bob: Owner%LLC: Inc
import csv

with open("split_test.txt", "r") as f:
    id_val = "Id Number"
    list_var = []
    for line in f:
        split_line = line.strip().split(':')
        print(split_line)
        if split_line[0] == id_val:
            d = {}
            d[split_line[0]] = split_line[1]
            list_var.append(d)
        else:
            d.update({split_line[0]: split_line[1]})
list_var
[{'Id Number': ' 12345689',
'Location': ' 8542561791254567090-8.9',
'Street': ' 854 Street AVE',
'Buyer': ' Jake and Bob'},
{'Id Number': ' 12345678',
'Location': ' 1234561791234567090-8.9',
'Street': ' 999 Street AVE',
'Buyer': ' john doe'},
{'Id Number': ' 12345688',
'Location': ' 3582561791254567090-8.9',
'Street': ' 123 Street AVE',
'Buyer': ' Jane doe # buyer % LLC'}]
with open("split_ex.csv", "w") as csv_file:
field_names = list_var[0].keys()
csv_writer = csv.DictWriter(csv_file, fieldnames=field_names)
csv_writer.writeheader()
for row in list_var:
csv_writer.writerow(row)
I would try reading the file line by line, splitting the key-value pairs into a list of dicts to look something like:
data = [
    {
        "Id Number": 12345678,
        "Location": 1234561791234567090-8.9,
        ...
    },
    {
        "Id Number": ...
    }
]

# easy to create the dataframe from here
your_df = pd.DataFrame(data)
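A minimal sketch of that line-by-line idea; the file name records.txt is hypothetical, and it assumes every record starts with an "Id Number" line:
import pandas as pd

data = []
record = {}
with open('records.txt') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue                          # skip blank lines, if any
        key, value = line.split(':', 1)       # split only on the first colon
        key, value = key.strip(), value.strip()
        if key == 'Id Number' and record:     # a new record begins
            data.append(record)
            record = {}
        record[key] = value
if record:
    data.append(record)                       # keep the last record too

your_df = pd.DataFrame(data)
your_df.to_csv('records.csv', index=False)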

Error with regex matching over 2 source files: expected string or buffer

So, from an input.txt file, I would like to create two dictionaries.
For example, here is a sample of the input.txt file:
#. VAR #first=Billy
#. VAR #last=Bob
#. PRINT VARS
#. VAR #petName=Gato
#. VAR #street="1234 Home Street"
#. VAR #city="New York"
#. VAR #state=NY
#. VAR #zip=21236
#. VAR #title=Dr.
#. PRINT VARS
#. FORMAT LM=5 JUST=LEFT
#. PRINT FORMAT
So the pattern is VAR #varName=value; i.e. in the case of #first=Billy you would get something like varDict = {"first": "Billy"}, right?
Now I want to know how to do that through the entire file.
There are two dictionaries that I would need to populate: one for the variables, and one for FORMAT, which just holds values and doesn't actually do anything for now.
As for the desired output: in the input file there are commands that, when read, trigger either adding variables to the dictionary, printing that dictionary, or adding to the format dictionary. I would use the pprint function like this, pprint.pprint(varDict, width=30), and it would output something like this:
{'first': 'Billy',
'last': 'Bob'}
{'city': 'New York',
'first': 'Billy',
'last': 'Bob',
'petName': 'Gato',
'state': 'NY',
'street': '1234 Home Street',
'title': 'Dr.',
'zip': '21236'}
{'BULLET': 'o',
'FLOW': 'YES',
'JUST': 'LEFT',
'LM': '5',
'RM': '80'}
Unfortunately I keep getting errors all over the place in the driver and source file:
AttributeError: 'list' object has no attribute 'groups'
TypeError: expected string or buffer
Driver.py
input = sys.argv[1]
# Group 1. VAR
# Group 2. #first=Mae or JUST=RIGHT FLOW=NO
# pass Group 2 as atString
regexSearch = re.compile(r'^#. ([A-Z]+) (.*)', re.MULTILINE)
regexPrintVAR = re.compile(r'^#\.\s*PRINT\s(VARS)', re.MULTILINE)
regexPrintFORMAT = re.compile(r'^#\.\s*PRINT\s(FORMAT)', re.MULTILINE)
regexERRCheck = re.compile(r'^#\.\s*FORMAT\s+BAD', re.MULTILINE)
varDictionary = dict()
formatDictionary = {"FLOW": "YES", "LM": "1", "RM": "80", "JUST": "LEFT", "BULLET": "o"}
file = open(input, "r")
while True:
    inputLine = file.readline()
    matchObj = regexSearch.search(inputLine)
    command, atString = matchObj.groups()
    if command == "VAR":
        setVariable(atString, varDictionary)
    if command == "FORMAT":
        formatListERR = regexERRCheck.search(inputLine)
        if formatListERR != None:
            print("*** Not a recognizable command")
            line = file.readline()
        setFormat(atString, formatDictionary)
    if command == "PRINT":
        printVARObj = regexPrintVAR.search(inputLine)
        printFormatObj = regexPrintFORMAT.search(inputLine)
        if printVARObj != None:
            pprint.pprint(varDictionary, width=30)
        elif printFormatObj != None:
            pprint.pprint(formatDict, width=30)
    inputLine = file.readline()
file.close()
importFileIUse.py
# The atString is the remainder of the string after the VAR or FORMAT key word.
varDictionary = dict()
formatDictionary = {"FLOW": "YES", "LM": "1", "RM": "80", "JUST": "LEFT", "BULLET": "o"}

def setFormat(atString, formatDictionary):
    regexFormat = re.compile(r'((?:(?:\w+)=(?:\w+)\s*)*)$')
    line = re.split(" +", atString)
    formatList = regexFormat.search(line)
    if formatList:
        for param in formatList[0].split():
            splitParam = param.split('=')
            formatDictionary[splitParam[0]] = splitParam[1]

def setVariable(atString, varDictionary):
    regexVAR = re.compile(r'#(\w+)=(\w+|.*)\s*$', re.MULTILINE)
    # file = open(input)
    # line = file.readline()
    # line = re.split(" +", atString)
    # while line:
    varList = regexVAR.findall(atString)
    for key, value in varList:
        varDictionary[key] = value
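A minimal sketch of how the two reported errors could be avoided; it only handles VAR and PRINT VARS, and the file name input.txt is taken from the question's sample. The idea is to skip lines the command regex does not match, so .groups() is never called on None, and to always pass a string, never a list, to search():
import re
import pprint

line_re = re.compile(r'^#\.\s*([A-Z]+)\s+(.*)$')
var_re = re.compile(r'#(\w+)=("[^"]*"|\S+)')

varDictionary = {}
with open('input.txt') as fp:
    for raw in fp:
        m = line_re.search(raw)
        if m is None:                 # no command on this line: skip instead of crashing
            continue
        command, atString = m.groups()
        if command == 'VAR':
            for key, value in var_re.findall(atString):
                varDictionary[key] = value.strip('"')
        elif command == 'PRINT' and atString.strip() == 'VARS':
            pprint.pprint(varDictionary, width=30)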

parse string into list based on input list

I would like to write a function in Python 3 to parse a string based on the elements of an input list. The following function works, but is there a better way to do it?
def func(oStr, s_s):
    res = []                 # the original snippet never initialised this
    if not oStr:
        return s_s
    elif '' in s_s:
        return [oStr]
    else:
        for x in s_s:
            st = oStr.find(x)
            end = st + len(x)
            res.append(oStr[st:end])
            oStr = oStr.replace(x, '')
        if oStr:
            res.append(oStr)
        return res
case 1
o_str = 'ABCNew York - Address'
s_str = ['ABC']
return ['ABC', 'New York - Address']
case 2
o_str = 'New York Friend Add | NumberABCNewYork Name | FirstName Last Name | time : Jan-31-2017'
s_str = ['New York Friend Add | Number', 'ABC', 'NewYork Name | FirstName Last Name | time: Jan-31-2017']
return ['New York Friend Add | Number', 'ABC', 'NewYork Name | FirstName Last Name | time: Jan-31-2017']
case 3
o_str = '-'
s_str = ['']
return ['-']
case 4
o_str = '1'
s_str = ['']
return ['1']
case 5
o_str = '1234Family-Name'
s_str = ['1234']
return ['1234', 'Family-Name']
case 6
o_str = ''
s_str = ['12345667', 'name']
return ['12345667', 'name']
To use a string like an array, convert it to a list of characters first, since Python strings are immutable and have no insert method. For example:
myStr = "Hello, World!"
myChars = list(myStr)
myChars.insert(len(myChars), 'your character here')
For your purposes .append() would work exactly the same way. Hope I helped.
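Since the question asks whether there is a better way, here is a hedged alternative sketch using str.partition; split_by_markers is a made-up name, and it assumes the markers occur in the string in the order they are listed:
def split_by_markers(o_str, markers):
    if not o_str:
        return markers
    if '' in markers:
        return [o_str]
    parts = []
    rest = o_str
    for m in markers:
        before, found, after = rest.partition(m)
        if not found:            # marker not present: ignore it
            continue
        if before:
            parts.append(before)
        parts.append(m)
        rest = after
    if rest:
        parts.append(rest)
    return parts

print(split_by_markers('ABCNew York - Address', ['ABC']))
# ['ABC', 'New York - Address']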
