split list with multiple delimiters - python

I have a list containing strings from lines in a txt file.
import csv
import re
from collections import defaultdict

parameters = ["name", "associated-interface", "type", "subnet", "fqdn", "wildcard-fqdn", "start-ip", "end-ip", "comment"]
address_dict = defaultdict(dict)
address_statements = []

with open("***somepaths**\\file.txt", "r") as address:
    in_address = False
    for line in address:
        line = line.strip()
        #print (line)
        if in_address and line != "next":
            if line == "end":
                break
            address_statements.append(line)
        else:
            if line == "config firewall address":
                in_address = True

#print(address_statements)
if address_statements:
    for statement in address_statements:
        op, param, *val = statement.split()
        if op == "edit":
            address_id = param
        elif op == "set" and param in parameters:
            address_dict[address_id][param] = ' '.join(val)

# output to the CSV
with open("***somepaths**\\file.csv", "w", newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=parameters)
    writer.writeheader()
    for key in address_dict:
        address_dict[key]['name'] = key
        writer.writerow(address_dict[key])
The output should be like this: edit "name test", but it turns out to cut off at the space after the name and be like this: edit "name
How can I include everything in the double quotes?

You are using
op, param, *val = statement.split()
which splits at spaces - a line of
edit "CTS SVR"
will put 'edit' into op, '"CTS' into param, and the remainder of the line (split at spaces, as a list) into val: ['SVR"'].
You need a way to split a string by spaces while preserving quoted substrings - for the case where params are internally separated by spaces and delimited by quotes.
Inspired by this answer, the csv module gives you what you need:
t1 = 'edit1 "some text" bla bla'
t2 = 'edit2 "some text"'
t3 = 'edit3 some thing'

import csv

reader = csv.reader([t1, t2, t3], delimiter=" ", skipinitialspace=True)
for row in reader:
    op, param, *remainder = row
    print(op, param, remainder, sep=" --> ")
Output:
edit1 --> some text --> ['bla', 'bla']
edit2 --> some text --> []
edit3 --> some --> ['thing']
You can apply the reader to one line only: reader = csv.reader([line], delimiter=" ").
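As a minimal sketch (the helper name and its use on a single statement are my illustration, not part of the question's code), that per-line variant could look like this:

import csv

def split_preserving_quotes(statement):
    # csv.reader yields one row per input line; quoted fields stay intact
    return next(csv.reader([statement], delimiter=" ", skipinitialspace=True))

op, param, *val = split_preserving_quotes('edit "CTS SVR"')
print(op, param, val)  # edit CTS SVR []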
Probably a duplicate of Split a string by spaces preserving quoted substrings - I close-voted the question earlier and cannot vote for duplicate anymore - hence the detailed answer.

Related

Converting text file to YAML in Python

I have a text file to convert to YAML format. Here are some notes to describe the problem a little better:
The sections within the file have a different number of subheadings to each other.
The values of the subheadings can be any data type (e.g. string, bool, int, double, datetime).
The file is approximately 2,000 lines long.
An example of the format is below:
file_content = '''
Section section_1
    section_1_subheading1 = text
    section_1_subheading2 = bool
end
Section section_2
    section_2_subheading3 = int
    section_2_subheading4 = double
    section_2_subheading5 = bool
    section_2_subheading6 = text
    section_2_subheading7 = datetime
end
Section section_3
    section_3_subheading8 = numeric
    section_3_subheading9 = int
end
'''
I have tried to convert the text to YAML format by:
Replacing the equal signs with colons using regex.
Replacing Section section_name with section_name :.
Removing end between each section.
However, I am having difficulty with #2 and #3. This is the text-to-YAML function I have created so far:
import yaml
import re

def convert_txt_to_yaml(file_content):
    """Converts a text file to a YAML file"""
    # Replace "=" with ":"
    file_content2 = file_content.replace("=", ":")
    # Split the lines
    lines = file_content2.splitlines()
    # Define section headings to find and replace
    section_names = "Section "
    section_headings = r"(?<=Section )(.*)$"
    section_colons = r"\1 : "
    end_names = "end"
    # Convert to YAML format, line-by-line
    for line in lines:
        add_colon = re.sub(section_headings, section_colons, line)  # Add colon to end of section name
        remove_section_word = re.sub(section_names, "", add_colon)  # Remove "Section " in section header
        line = re.sub(end_names, "", remove_section_word)  # Remove "end" between sections
    # Join lines back together
    converted_file = "\n".join(lines)
    return converted_file
I believe the problem is within the for loop - I can't manage to figure out why the section headers and endings aren't changing. Each substitution prints perfectly if I test it, but the changes aren't being saved to the lines themselves.
The output format I am looking for is the following:
file_content = '''
section_1 :
    section_1_subheading1 : text
    section_1_subheading2 : bool
section_2 :
    section_2_subheading3 : int
    section_2_subheading4 : double
    section_2_subheading5 : bool
    section_2_subheading6 : text
    section_2_subheading7 : datetime
section_3 :
    section_3_subheading8 : numeric
    section_3_subheading9 : int
'''
The reason your loop changes nothing is that re.sub returns a new string and you only rebind the loop variable line; you never write the result back into lines, so "\n".join(lines) joins the untouched originals. Rather than patching that, I would convert the text to a dict and then format it as YAML using the yaml package in Python, as below:
import re
import yaml

def convert_txt_to_yaml(file_content):
    """Converts a text file to a YAML string"""
    config_dict = {}
    # Split the lines
    lines = file_content.splitlines()
    section_title = None
    for line in lines:
        if line == '\n':
            continue
        elif re.match('.*end$', line):
            # End of section
            section_title = None
        elif re.match(r'.*Section\s+.*', line):
            # Start of section
            match_obj = re.match(r".*Section\s+(.*)", line)
            section_title = match_obj.groups()[0]
            config_dict[section_title] = {}
        elif section_title and re.match(r".*{}_.*\s+=.*".format(section_title), line):
            match_obj = re.match(r".*{}_(.*)\s+=(.*)".format(section_title), line)
            config_dict[section_title][match_obj.groups()[0]] = match_obj.groups()[1]
    return yaml.dump(config_dict)
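A quick usage sketch against the question's sample file_content (two behaviors worth noting: yaml.dump sorts keys alphabetically unless you pass sort_keys=False, and the capture group strips the leading "sectionname_" prefix from each subheading key):

print(convert_txt_to_yaml(file_content))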

Python3 - Nested dict to JSON

I am trying to convert multiple .txt files to "table-like" data (with columns and rows). Each .txt file should be considered as a new column.
Consider the below content of the .txt files:
File1.txt
Hi there
How are you doing?
What is your name?
File2.txt
Hi
Great!
Oliver, what's yours?
I have created a simple method that accepts the file and an integer (the file number from another method):
import json
import re
from collections import defaultdict

def txtFileToJson(text_file, column):
    data = defaultdict(list)
    i = int(1)
    with open(text_file) as f:
        data[column].append(column)
        for line in f:
            i = i + 1
            for line in re.split(r'[\n\r]+', line):
                data[column] = line
    with open("output.txt", 'a+') as f:
        f.write(json.dumps(data))
So the above method will run two times (one time for each file) and append the data.
This is the output.txt file after I have run my script:
{"1": "What is your name?"}{"2": "Oliver, what's yours?"}
As you can see, I can only get it to create a new object per file, holding just the last line. What I want instead is something like this:
[{
"1": [{
"1": "Hi there",
"2": "How are you doing?",
"3": "\n"
"4": "What is your name?"
},
"2": [{
"1": "Hi"
"2": "Great!",
"3": "\n",
"4": "Oliver, what's yours?"
},
}]
Update:
OK, so I played around a bit and got a bit closer:
myDict = {str(column): []}
i = int(1)
with open(text_file) as f:
    for line in f:
        # data[column].append(column)
        match = re.split(r'[\n\r]+', line)
        if match:
            myDict[str(column)].append({str(i): line})
            i = i + 1
with open(out_file, 'a+') as f:
    f.write(json.dumps(myDict[str(column)]))
That gives me the output below:
[{"1": "Hi there\n"}, {"2": "How are you doing?\n"}, {"3": "\n"}, {"4": "What is your name?"}]
[{"1": "Hi\n"}, {"2": "Great!\n"}, {"3": "\n"}, {"4": "Oliver, what's yours?"}]
But as you can see, now I have multiple JSON root elements.
Solution
Thanks to jonyfries, I did this:
data = defaultdict(list)
for path in images.values():
    column = column + 1
    data[str(column)] = txtFileToJson(path, column)
saveJsonFile(path, data)
And then added a new method to save the final combined list:
import json
import os

def saveJsonFile(text_file, data):
    basename = os.path.splitext(os.path.basename(text_file))
    dir_name = os.path.dirname(text_file) + "/"
    text_file = dir_name + basename[0] + "1.txt"
    out_file = dir_name + 'table_data.txt'
    with open(out_file, 'a+') as f:
        f.write(json.dumps(data))
You're creating a new dictionary within the function itself, so each time you pass a text file in, a fresh dictionary is created.
The easiest solution seems to be returning the dictionary you created and adding it to an existing dictionary.
def txtFileToJson(text_file, column):
    myDict = {str(column): []}
    i = int(1)
    with open(text_file) as f:
        for line in f:
            # data[column].append(column)
            match = re.split(r'[\n\r]+', line)
            if match:
                myDict[str(column)].append({str(i): line})
                i = i + 1
    with open(out_file, 'a+') as f:
        f.write(json.dumps(myDict[str(column)]))
    return myDict

data = defaultdict(list)
data["1"] = txtFileToJson(text_file, column)
data["2"] = txtFileToJson(other_text_file, other_column)
import json

def read(text_file):
    data, i = {}, 0
    with open(text_file) as f:
        for line in f:
            i = i + 1
            data['row_%d' % i] = line.rstrip('\n')
    return data

res = {}
for i, fname in enumerate([r'File1.txt', r'File2.txt']):
    res[i] = read(fname)

with open(out_file, 'w') as f:
    json.dump(res, f)
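For the two sample files from the question (assuming they contain no blank lines), out_file would then hold a single JSON document shaped roughly like this:

{"0": {"row_1": "Hi there", "row_2": "How are you doing?", "row_3": "What is your name?"},
 "1": {"row_1": "Hi", "row_2": "Great!", "row_3": "Oliver, what's yours?"}}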
First, if I understand correctly, you are trying to get a dictionary of dictionaries as output. Note that your desired output encloses the whole thing within a list, and that it has unbalanced opening and closing brackets within the dictionaries; I will ignore both, as I will the enclosing list.
I think you need something like:
#!python3
import json
import re

def processTxtFile(text_file, n, data):
    d = {}
    with open(text_file) as f:
        i = 0
        for line in f:
            for line in re.split(r'[\n\r]+', line):
                i = i + 1
                d[str(i)] = line
    data[str(n)] = d

data = dict()
processTxtFile('File1.txt', 1, data)
processTxtFile('File2.txt', 2, data)

with open("output.txt", 'wt') as f:
    f.write(json.dumps(data))
If you really need the nested dictionaries to be enclosed within a list, then replace
data[str(n)] = d
with:
data[str(n)] = [d]

Converting Fixed-Width File to .txt then the .txt to .csv

I have a fixed-width file that I have no issues importing and splitting into 31 txt files. The spaces from the fixed-width file are preserved in this process, since the writing step simply writes each entry from the fixed-width file as a new line.
My issue is that when I use Python's csv module, these spaces are replaced with " (a quotation mark) as a placeholder.
I'm looking for a way to produce a csv file without these double quotes as placeholders, while maintaining the required formatting initially set in the fixed-width file.
Initial line in txt doc:
'PAY90004100095206 9581400086000909 0008141000 5350 3810 C 000021841998051319980513P810406247 FELT, MARTIN & FRAZIER, P.C. FELT, MARTIN & FRAZIER, P.C. 208 NORTH BROADWAY STE 313 BILLINGS MT59101-0 NLance Martin v. Whitman College N00000000NN98004264225 SYS656 19980512+000000378761998041319980421+000000378769581400086000909 000+000000 Lance Martin v. Whitman College 00000000 00010001 +00000000000002184 000000021023.005000000003921.005\n'
.py:
import csv

read_loc = 'c:/Users/location/e0290000005.txt'
e02ext_start = read_loc.find('e02')
e02_ext = read_loc[int(e02ext_start):]

with open(read_loc, 'r') as f:
    contents = f.readlines()

dict_of_record_lists = {}
# takes first 3 characters of each line and if a matching dictionary key is found
# it appends the line to the value-list
for line in contents:
    record_type = (line[:3])
    dict_of_record_lists.setdefault(record_type, []).append(line)
slice_list_CLM = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,47),(47,55),(55,59),(59,109),(109,189),(189,191),(191,193),(193,194),(194,195),(195,203),(203,211),(211,219),(219,227),(227,235),(235,237),(237,239),(239,241),(241,245),(245,249),(249,253),(253,257),(257,261),(261,291),(291,316),(316,331),(331,332),(332,357),(357,377),(377,378),(378,408),(408,438),(438,468),(468,470),(470,485),(485,505),(505,514),(514,517),(517,525),(525,533),(533,535),(535,536),(536,537),(537,545),(545,551),(551,553),(553,568),(568,572),(572,587),(587,602),(602,627),(627,631),(631,638),(638,642),(642,646),(646,654),(654,662),(662,670),(670,672),(672,674),(674,675),(675,676),(676,682),(682,700),(700,708),(708,716),(716,717),(717,725),(725,733),(733,741),(741,749),(749,759),(759,761),(761,762),(762,763),(763,764),(764,765),(765,768),(768,769),(769,770),(770,778),(778,779),(779,783),(783,787),(787,788),(788,805),(805,817),(817,829),(829,833),(833,863),(863,893),(893,896),(896,897),(897,898),(898,928),(928,936),(936,944),(944,945),(945,947),(947,959),(959,971),(971,983),(983,995),(995,1007),(1007,1019),(1019,1031),(1031,1043),(1043,1055),(1055,1067),(1067,1079),(1079,1091),(1091,1103),(1103,1115),(1115,1127),(1127,1139),(1139,1151),(1151,1163),(1163,1175),(1175,1187),(1187,1197),(1197,1202),(1202,1203),(1203,1211),(1211,1214),(1214,1215),(1215,1233),(1233,1241),(1241,1257),(1257,1272),(1272,1273),(1273,1285),(1285,1289),(1289,1293),(1293,1343),(1343,1365),(1365,1685),(1685,1686),(1686,1704),(1704,1708),(1708,1748),(1748,1768),(1768,1770),(1770,1772),(1772,1773),(1773,1782),(1782,1784),(1784,1792),(1792,1793),(1793,1796),(1796,1800)]
slice_list_CTL = [(0,3),(3,7),(7,15),(15,23),(23,31),(31,39),(39,47),(47,55),(55,56),(56,65),(65,74),(74,83),(83,98),(98,113),(113,128),(128,143),(143,158),(158,173),(173,188),(188,203),(203,218),(218,233),(233,248),(248,263),(263,278),(278,293),(293,308),(308,323),(323,338),(338,353),(353,368),(368,383),(383,398),(398,413),(413,428),(428,443),(443,458),(458,473),(473,488),(488,503),(503,518),(518,527),(527,536),(536,545),(545,554),(554,563),(563,572),(572,581),(581,590),(590,599),(599,614),(614,623),(623,638),(638,647),(647,662),(662,671),(671,686),(686,695),(695,710),(710,719),(719,728),(728,737),(737,746),(746,755),(755,764),(764,773),(773,782),(782,791),(791,800),(800,809),(809,818),(818,827),(827,836),(836,845),(845,854),(854,863),(863,872),(872,881),(881,890),(890,899),(899,908)]
slice_list_ADR = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,50),(50,53),(53,62),(62,65),(65,66),(66,91),(91,111),(111,121),(121,151),(151,181),(181,206),(206,208),(208,223),(223,243),(243,261),(261,265),(265,283),(283,287),(287,305),(305,335),(335,375),(375,383),(383,387),(387,437),(437,438),(438,446),(446,454),(454,461),(461,468),(468,484),(484,500)]
slice_list_AGR = [(0,3),(3,7),(7,45),(45,85),(85,93),(93,101),(101,109),(109,117),(117,127),(127,139),(139,151)]
slice_list_ACN = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,65),(65,95),(95,115),(115,145),(145,165),(165,195),(195,215),(215,245),(245,265),(265,295),(295,315),(315,345),(345,365),(365,395),(395,415),(415,445),(445,465),(465,495),(495,515),(515,545),(545,565),(565,595),(595,615),(615,645),(645,665),(665,695),(695,715),(715,745),(745,765),(765,795),(795,815),(815,845),(845,865),(865,895),(895,915),(915,945),(945,965),(965,995),(995,1015),(1015,1045),(1045,1061)]
slice_list_CST = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,53),(53,59),(59,60),(60,61),(61,62),(62,64),(64,80),(80,82),(82,84),(84,86),(86,88),(88,104)]
slice_list_MCF = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,49),(49,79),(79,94),(94,159),(159,175),(175,191)]
slice_list_DD1 = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,46),(46,54),(54,62),(62,63),(63,69),(69,75),(75,81),(81,87),(87,93),(93,94),(94,95),(95,103),(103,111),(111,119),(119,126),(126,134),(134,143),(143,154),(154,162),(162,170),(170,178),(178,186),(186,194),(194,202),(202,205),(205,208),(208,210),(210,218),(218,220),(220,228),(228,230),(230,238),(238,240),(240,248),(248,250),(250,258),(258,274)]
slice_list_DES = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,1300),(1300,1316)]
slice_list_IBC = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,48),(48,50),(50,54),(54,55),(55,56),(56,81),(81,101),(101,121),(121,124),(124,125),(125,145),(145,146),(146,149),(149,152),(152,154),(154,179),(179,199),(199,219),(219,222),(222,224),(224,227),(227,230),(230,238),(238,249),(249,265),(265,281)]
slice_list_ICD = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,57),(57,63),(63,69),(69,75),(75,81),(81,87),(87,95),(95,103),(103,111),(111,114),(114,122),(122,125),(125,126),(126,142),(142,144),(144,152),(152,154),(154,162),(162,164),(164,172),(172,174),(174,182),(182,184),(184,192),(192,208)]
slice_list_LEG = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,53),(53,61),(61,65),(65,73),(73,81),(81,82),(82,90),(90,98),(98,133),(133,148),(148,163),(163,164),(164,172),(172,180),(180,181),(181,216),(216,256),(256,296),(296,326),(326,356),(356,381),(381,383),(383,398),(398,418),(418,438),(438,456),(456,474),(474,509),(509,549),(549,589),(589,619),(619,649),(649,674),(674,676),(676,691),(691,711),(711,731),(731,749),(749,767),(767,782),(782,790),(790,798),(798,806),(806,810),(810,818),(818,826),(826,834),(834,840),(840,849),(849,879),(879,888),(888,918),(918,920),(920,921),(921,923),(923,931),(931,939),(939,943),(943,944),(944,952),(952,960),(960,990),(990,1020),(1020,1050),(1050,1051),(1051,1086),(1086,1095),(1095,1135),(1135,1175),(1175,1205),(1205,1235),(1235,1260),(1260,1262),(1262,1277),(1277,1295),(1295,1304),(1304,1312),(1312,1328)]
slice_list_LD1 = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,65),(65,95),(95,125),(125,150),(150,152),(152,167),(167,187),(187,205),(205,223),(223,227),(227,252),(252,267),(267,279),(279,309),(309,339),(339,359),(359,361),(361,376),(376,396),(396,414),(414,439),(439,440),(440,448),(448,454),(454,456),(456,871),(471,472),(472,492),(492,522),(522,552),(552,572),(572,574),(574,589),(589,609),(609,627),(627,637),(637,645),(645,685),(685,686),(686,706),(706,714),(714,744),(744,774),(774,794),(794,796),(796,811),(811,831),(831,849),(849,879),(879,909),(909,929),(929,931),(931,946),(946,966),(966,984),(984,992),(992,1004),(1004,1024),(1024,1064),(1064,1081),(1081,1098),(1098,1106),(1106,1121),(1121,1122),(1122,1152),(1152,1153),(1153,1162),(1162,1170),(1170,1185),(1185,1190),(1190,1220),(1220,1238),(1238,1253),(1253,1283),(1283,1301),(1301,1302),(1302,1303),(1303,1333),(1333,1363),(1363,1388),(1388,1390),(1390,1405),(1405,1406),(1406,1436),(1436,1442),(1442,1462),(1462,1463),(1463,1478),(1478,1493),(1493,1533),(1533,1535),(1535,1538),(1538,1540),(1540,1556),(1556,1756)]
slice_list_LD2 = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,60),(60,78),(78,118),(118,148),(148,178),(178,203),(203,205),(205,220),(220,238),(238,256),(256,260),(260,270),(270,290),(290,300),(300,302),(302,322),(322,352),(352,377),(377,397),(397,398),(398,423),(423,424),(424,454),(454,455),(455,456),(456,458),(458,474)]
slice_list_LD3 = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,46),(46,71),(71,91),(91,92),(92,122),(122,152),(152,177),(177,179),(179,194),(194,197),(197,205),(205,213),(213,221),(221,229),(229,237),(237,297),(297,305),(305,313),(313,321),(321,329),(329,337),(337,345),(345,353),(353,361),(361,421),(421,429),(429,489),(489,497),(497,557),(557,617),(617,633)]
slice_list_NET = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,53),(53,61),(61,69),(69,77),(77,88),(88,99),(99,105),(105,135),(135,146),(146,152),(152,182),(182,193),(193,199),(199,229),(229,240),(240,246),(246,276),(276,287),(287,293),(293,323),(323,334),(334,340),(340,370),(370,381),(381,387),(387,417),(417,428),(428,434),(434,464),(464,475),(475,481),(481,511),(511,522),(522,528),(528,558),(558,569),(569,575),(575,605),(605,616),(616,622),(622,652),(652,663),(663,669),(669,699),(699,710),(710,716),(716,746),(746,757),(757,763),(763,793),(793,804),(804,810),(810,840),(840,851),(851,857),(857,887),(887,898),(898,904),(904,934),(934,945),(945,951),(951,981),(981,992),(992,998),(998,1028),(1028,1039),(1039,1047),(1047,1055),(1055,1061),(1061,1077),(1077,1087),(1087,1103)]
slice_list_NOT = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,47),(47,55),(55,63),(63,71),(71,77),(77,79),(79,1279),(1279,1295),(1295,1296),(1296,1312)]
slice_list_OFF = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,75),(75,78),(78,93),(93,105),(105,107),(107,115),(115,123),(123,131),(131,132),(132,148)]
slice_list_PAY = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,60),(60,61),(61,65),(65,73),(73,81),(81,89),(89,90),(90,130),(130,165),(165,205),(205,245),(245,275),(275,305),(305,330),(330,332),(332,347),(347,367),(367,368),(368,428),(428,429),(429,437),(437,438),(438,439),(439,450),(450,452),(452,455),(455,458),(458,473),(473,481),(481,493),(493,501),(501,509),(509,521),(521,539),(539,542),(542,549),(549,552),(552,562),(562,567),(567,627),(627,635),(635,643),(643,647),(647,651),(651,653),(653,654),(654,684),(684,692),(692,702),(702,713),(713,1034),(1034,1050),(1050,1066)]
slice_list_PRC = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,46),(46,51),(51,81),(81,84),(84,87),(87,95),(95,103),(103,119),(119,125),(125,131),(131,147)]
slice_list_ACR = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,51),(51,59),(59,71),(71,79),(79,91),(91,103),(103,119),(119,135)]
slice_list_REC = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,58),(58,71),(71,84),(84,97),(97,110),(110,123),(123,136),(136,149),(149,162),(162,175),(175,188),(188,201),(201,214),(214,227),(227,240),(240,253),(253,266),(266,279),(279,292),(292,305),(305,318),(318,331),(331,344),(344,357),(357,370),(370,383),(383,396),(396,409),(409,422),(422,435),(435,448),(448,461),(461,474),(474,487),(487,500),(500,513),(513,526),(526,539),(539,552),(552,565),(565,578),(578,591),(591,604),(604,617),(617,630),(630,643),(643,656),(656,669),(669,682),(682,695),(695,708),(708,721),(721,734),(734,747),(747,760),(760,773),(773,786),(786,799),(799,812),(812,825),(825,838),(838,851),(851,864),(864,877),(877,890),(890,903),(903,916),(916,929),(929,942),(942,955),(955,968),(968,981),(981,997)]
slice_list_RED = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,57),(57,69),(69,81),(81,93),(93,105),(105,117),(117,129),(129,141),(141,157)]
slice_list_REI = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,61),(61,67),(67,87),(87,88),(88,100),(100,108),(108,116),(116,176),(176,192),(192,193),(193,199),(199,214),(214,222),(222,230),(230,238),(238,250),(250,251),(251,311),(311,327)]
slice_list_RES = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,46),(46,54),(54,134),(134,136),(136,148),(148,160),(160,172),(172,184),(184,196),(196,208),(208,220),(220,232),(232,242),(242,252),(252,262),(262,272),(272,282),(282,292),(292,299),(299,309),(309,319),(319,329),(329,339),(339,349),(349,359),(359,369),(369,379),(379,389),(389,399),(399,409),(409,419),(419,429),(429,439),(439,449),(449,465),(465,475),(475,975),(975,991)]
slice_list_RST = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,53),(53,61),(61,69),(69,77),(77,87),(87,95),(95,125),(125,145),(145,161),(161,177)]
slice_list_SPC = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,53),(53,61),(61,69),(69,77),(77,85),(85,93),(93,101),(101,109),(109,117),(117,125),(125,133),(133,149)]
slice_list_SSN = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,54),(54,62),(62,74),(74,82),(82,94),(94,102),(102,114),(114,122),(122,134),(134,142),(142,143),(143,151),(151,159),(159,160),(160,168),(168,176),(176,177),(177,185),(185,193),(193,194),(194,202),(202,210),(210,211),(211,219),(219,220),(220,228),(228,268),(268,276),(276,277),(277,293)]
slice_list_WRK = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,53),(53,57),(57,72),(72,73),(73,81),(81,82),(82,90),(90,98),(98,106),(106,114),(114,122),(122,130),(130,131),(131,132),(132,133),(133,153),(153,154),(154,155),(155,159),(159,179),(179,180),(180,240),(240,248),(248,256),(256,264),(264,272),(272,280),(280,284),(284,288),(288,298),(298,314),(314,330)]
slice_list_WD1 = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,54),(54,58),(58,59),(59,60),(60,61),(61,63),(63,73),(73,74),(74,82),(82,83),(83,91),(91,99),(99,107),(107,108),(108,118),(118,120),(120,130),(130,137),(137,139),(139,149),(149,156),(156,158),(158,168),(168,175),(175,177),(177,187),(187,194),(194,196),(196,206),(206,213),(213,223),(223,233),(233,243),(243,253),(253,263),(263,273),(273,283),(283,293),(293,303),(303,311),(311,314),(314,322),(322,332),(332,342),(342,352),(352,353),(353,354),(354,355),(355,365),(365,375),(375,385),(385,395),(395,405),(405,415),(415,425),(425,435),(435,436),(436,437),(437,438),(438,439),(439,440),(440,442),(442,443),(443,444),(444,445),(445,446),(446,448),(448,458),(458,460),(460,470),(470,472),(472,482),(482,484),(484,494),(494,496),(496,506),(506,508),(508,518),(518,528),(528,542),(542,543),(543,551),(551,559),(559,561),(561,565),(565,567),(567,574),(574,582),(582,583),(583,584),(584,585),(585,593),(593,594),(594,595),(595,596),(596,604),(604,605),(605,606),(606,607),(607,615),(615,616),(616,617),(617,618),(618,626),(626,627),(627,628),(628,629),(629,637),(637,645),(645,653),(653,661),(661,669),(669,677),(677,685),(685,693),(693,701),(701,709),(709,717),(717,721),(721,729),(729,732),(732,734),(734,738),(738,746),(746,749),(749,751),(751,755),(755,763),(763,766),(766,774),(774,782),(782,790),(790,798),(798,800),(800,801),(801,802),(802,813),(813,829)]
slice_list_WD3 = [(0,3),(3,7),(7,15),(15,21),(21,39),(39,42),(42,45),(45,46),(46,47),(47,48),(48,49),(49,50),(50,51),(51,52),(52,53),(53,54),(54,55),(55,56),(56,57),(57,58),(58,98),(98,138),(138,178),(178,182),(182,183),(183,191),(191,197),(197,213)]
slice_dict = {
'CLM' : slice_list_CLM,
'CTL' : slice_list_CTL,
'ADR' : slice_list_ADR,
'AGR' : slice_list_AGR,
'ACN' : slice_list_ACN,
'CST' : slice_list_CST,
'MCF' : slice_list_MCF,
'DD1' : slice_list_DD1,
'DES' : slice_list_DES,
'IBC' : slice_list_IBC,
'ICD' : slice_list_ICD,
'LEG' : slice_list_LEG,
'LD1' : slice_list_LD1,
'LD2' : slice_list_LD2,
'LD3' : slice_list_LD3,
'NET' : slice_list_NET,
'NOT' : slice_list_NOT,
'OFF' : slice_list_OFF,
'PAY' : slice_list_PAY,
'PRC' : slice_list_PRC,
'ACR' : slice_list_ACR,
'REC' : slice_list_REC,
'RED' : slice_list_RED,
'REI' : slice_list_REI,
'RES' : slice_list_RES,
'RST' : slice_list_RST,
'SPC' : slice_list_SPC,
'SSN' : slice_list_SSN,
'WRK' : slice_list_WRK,
'WD1' : slice_list_WD1,
'WD3' : slice_list_WD3,
}
def slicer(file, slice_list):
    csv_string = ""
    for i in slice_list:
        csv_string += (file[i[0]:i[1]] + ",")
    return csv_string
for key, value in dict_of_record_lists.items():
    for k, v in slice_dict.items():
        if key == k:
            iteration = 0
            for i in value:
                s = slicer(i, v)
                value[iteration] = s
                iteration += 1

e02_ext = read_loc[int(e02ext_start):]
csv_ext = e02_ext[:-3] + 'csv'

# file overview/log that shows how many lines should exist in the other files to ensure everything wrote correctly
overview_loc = 'c:/Users/location/E02_ingestion/' + 'overview_' + e02_ext  # put in the file location where you would like to see logs
with open(overview_loc, 'w') as overview_file:
    for key, value in dict_of_record_lists.items():
        overview_file.write((key + ' ' + (str(len(value))) + '\n'))

# if the list isn't empty writes a new file w/prefix matching key and includes the lines
for key, value in dict_of_record_lists.items():
    write_loc = 'c:/Users/location/E02_ingestion/' + key + '_' + e02_ext
    with open(write_loc, "w", newline='') as parsed_file:
        for line in value:
            line_pre = "%s\n" % line
            parsed_file.write(line_pre[:-1])

for key, value in dict_of_record_lists.items():
    write_loc = 'c:/Users/location/E02_ingestion/' + key + '_' + csv_ext
    with open(write_loc, "w", newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=' ')
        for i in value:
            writer.writerow(i)
This is a sample of a section of output in both Excel and our SQL table:
P A Y 9 0 0 0 4 1 0 0 0 9 5 2 0 7 " " " " " " " "
Desired output (without " as placeholders for spaces):
P A Y 9 0 0 0 4 1 0 0 0 9 5 2 0 7
Any help would be greatly appreciated.
Why:
The problem you are facing is that the rows of processed data you write contain entries consisting only of the csv delimiter character. The module then quotes them to distinguish your "delimiter-only data" from the delimiter between columns. (Note that writer.writerow(i) is handed a whole string here, and iterating a string yields single characters, so every space in a sliced line becomes its own one-character field.)
When writing something like [["PAY", "....", " ", " ", " ", " "]] into a csv using ' ' as the divider, you get them output quoted:
import csv

dict_of_record_lists = {"K": [["PAY", "....", " ", " ", " ", " "]]}

for key, value in dict_of_record_lists.items():
    write_loc = 't.txt'
    with open(write_loc, "w", newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=' ')
        for i in value:
            writer.writerow(i)

print(open(write_loc).read())  # PAY .... " " " " " " " "
Fix:
You can fix that by specifying quoting=csv.QUOTE_NONE and providing an escapechar=..., or by fixing your data. Providing an escapechar would put that character into your file, though.
Relevant portion of the documentation: csv.QUOTE_NONE.
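Here is a minimal sketch of the QUOTE_NONE route (the file name and the backslash escapechar are arbitrary illustration choices):

import csv

with open('t_noquote.txt', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=' ',
                        quoting=csv.QUOTE_NONE, escapechar='\\')
    # space-only fields are no longer quoted, but each delimiter character
    # inside a field now gets escaped, so backslashes land in the file instead
    writer.writerow(["PAY", "....", " ", " "])

print(open('t_noquote.txt').read())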
You can manipulate your data to not contain "only" the delimiters as data:
for key, value in dict_of_record_lists.items():
    write_loc = 'c:/Users/location/E02_ingestion/' + key + '_' + csv_ext
    with open(write_loc, "w", newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=' ')
        for i in value:
            # if an inner item only contains delimiter characters, set it to an empty string
            cleared = [x if x.strip(" ") else "" for x in i]
            writer.writerow(cleared)
HTH
Docs:
https://docs.python.org/3/library/csv.html
Was able to change the initial text writing portion to:
for key, value in dict_of_record_lists.items():
    write_loc = 'c:/Users/Steve Barnard/Desktop/Git_Projects/E02_ingestion/' + key + '_' + csv_ext
    with open(write_loc, "w", newline='') as parsed_file:
        for line in value:
            line_pre = "%s" % line
            parsed_file.write(line_pre[:-1] + '\n')
All the issues were fixed by avoiding Python's built-in CSV writer.
The way my program appended a comma after each line slice left one extra ',' plus the '\n' at the end of every line; the [:-1] slice in the write call was therefore removing the '\n' and not the final ','. By stripping the final ',' and then adding the '\n' back, the entire problem was fixed and a functioning CSV that retained the spacing was created.
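A small illustration (sample string assumed) of the slicing change described above:

line = "PAY,90004100095206, ,"      # slicer output: note the trailing comma
print(repr(("%s\n" % line)[:-1]))   # old way: strips the '\n', keeps the ','
print(repr(line[:-1] + '\n'))       # new way: strips the ',', restores the '\n'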
A text file can be created by swapping out the extension upon writing.

Python- How do I update an index of a for loop that iterates over lines in a file?

Using a for loop, I'm iterating over the lines in a file. Given this line:
line = ['641', '"Tornadus', ' (Incarnate Form)"', '"Flying"', '""', '5', '"TRUE"']
I need to reformat index [6] from '"TRUE"' to the boolean True.
Full expected output: d = {'Tornadus, (Incarnate Form)': (641, 'Flying', None, 5, True)}
I used:
if "T" in line[6]: # format legendary if TRUE
line[6] = True
But I get this error:
Traceback (most recent call last):
  File "tester5p.py", line 305, in test_read_info_file_05
    self.assertEqual(read_info_file(DATAFILE), info_db5())
  File "/Users/kgreenwo/Desktop/student.py", line 52, in read_info_file
    line[5] = False
TypeError: 'str' object does not support item assignment
How can I assign it WITHIN the for loop?
Here is my full code:
def read_info_file(filename):
    f = open(filename, 'r')  # open file in read mode
    d = {}  # initialize as empty
    count = 0  # helps to skip first line
    key = ""
    for line in f:  # get each line from file
        if count != 0:  # skip first line
            # 1___________________________________________open file, read, skip 1st line
            id_num = int(line[0])  # make id an integer
            # 2________________________________________________
            if ',' in line[1]:  # two parts to fullname, changes indexes
                part1 = line[1].strip('"')  # get format first part of name
                part2 = line[2].strip()  # get format second part of name
                # 3______________
                fullname = part1 + part2
                key = fullname
                # 4______________
                type1 = line[3].strip('"')
                # 5--------------
                if line[4] == "":  # check if there is not a second type
                    type2 = None  # correct format
                else:  # is a second type
                    type2 = line[4].strip('"')  # format second type
                # 6______________
                generation = line[5]  # format generation
                # 7_____________
                if "T" in line[6]:  # format legendary if TRUE
                    line[6] = True
                    legendary = line[6]
                else:  # format legendary if FALSE
                    line[6] = False
                    legendary = line[6]
            # 8______________________________________________one part to name
            else:  # one part to name
                fullname = line[1].strip('"')
                # 9______________
                type1 = line[2].strip('"')
                # 10_____________
                if line[3] == "":  # if no second type
                    type2 = None
                else:
                    type2 = line[3].strip('"')  # there is a second type
                # 11_____________
                generation = line[4]  # format generation
                # 12_____________
                if "T" in line[5]:  # format legendary if TRUE
                    line[5] = True
                    legendary = line[5]
                else:  # format Legendary if False
                    line[5] = False
                    legendary = line[5]
            value = (id_num, type1, type2, generation, legendary)
            d.update([(key, value)])
        count += 1
    return d
Reproducible example:
input: (don't forget to skip first line!)
info_file1 = '''"ID","Name","Type 1","Type 2","Generation","Legendary"
1,"Bulbasaur","Grass","Poison",1,"FALSE"
Output:
d={'Bulbasaur':(1,'Grass','Poison',1,False)}
It is quite unclear from your example, but my thoughts go to:
for line in f:
    line = line.split(',')
Now you can mess with indexes and see whether you have more errors.
And if you use:
if "T" in line[6]:  # format legendary if TRUE
    line[6] = True
it will work.
Your input file looks like a comma-separated values file. If it is, what you want is pretty easy.
Let's suppose your input file is literally this:
Input_file-43644346.txt
info_file1 = '''"ID","Name","Type 1","Type 2","Generation","Legendary"
1,"Bulbasaur","Grass","Poison",1,"FALSE"
641,"Tornadus', ' (Incarnate Form)","Flying",,5,"TRUE"
You could do something like that:
#!/usr/bin/env python3
import csv

input_file_name = "Input_file-43644346.txt"
with open(input_file_name, newline='') as input_file:
    next(input_file)  # skip first line
    record_extractor = csv.reader(input_file)
    d = {}
    for row in record_extractor:
        key = row[1].strip()
        row_truth = row[5] == "TRUE"  # simplifying the boolean retrieving
        # Using conditional expressions
        row_second_type = row[3].strip() if row[3] else None
        output_row = (row[0], row[2], row_second_type, row[4], row_truth)
        d[key] = output_row

print("d=", d)
Here are some key points of this solution:
This example is in Python 3's syntax
Using with makes sure that the input file is closed timely
Since a file object is also an iterator, you can skip the first line by using next().
csv.reader() will give you a list containing the information from a row. It will process quoted strings like you would expect.
The expression row[5] == "TRUE" will yield a boolean value. You don't need to use an if statement.
An empty string is equivalent to False. Any other string is True.
Conditional expressions can be used to change an empty string to None like you wanted.
dict.update() is useful if you already have a dictionary or a list of key-value tuples to merge into a dictionary, but here you are better off using d[key] = value
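To see the quoted-name handling in isolation, here is a tiny standalone check (the sample row is adapted from the question, with its stray inner quotes removed):

import csv

row = next(csv.reader(['641,"Tornadus, (Incarnate Form)","Flying",,5,"TRUE"']))
print(row[1])            # Tornadus, (Incarnate Form)
print(row[5] == "TRUE")  # True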
But my guess is that your file is more like that:
Input_file-43644346b.txt
"ID","Name","Type 1","Type 2","Generation","Legendary"
1,"Bulbasaur","Grass","Poison",1,"FALSE"
641,"Tornadus', ' (Incarnate Form)","Flying",,5,"TRUE"
You can then use csv.DictReader to read your data:
#!/usr/bin/env python3
import csv

input_file_name = "Input_file-43644346b.txt"
with open(input_file_name, newline='') as input_file:
    record_extractor = csv.DictReader(input_file)
    d = {}
    for row in record_extractor:
        key = row["Name"].strip()
        row_truth = row["Legendary"] == "TRUE"
        row_second_type = row["Type 2"].strip() if row["Type 2"] else None
        output_row = (row["ID"], row["Type 1"],
                      row_second_type, row["Generation"], row_truth)
        d[key] = output_row

print("d=", d)
That enables you to use "column" names to identify the different parts of each row.
You can simplify your code even more by using a dictionary comprehension:
#!/usr/bin/env python3
import csv

input_file_name = "Input_file-43644346.txt"
with open(input_file_name, newline='') as input_file:
    next(input_file)  # skip first line
    record_extractor = csv.reader(input_file)
    d = {row[1]: (row[0],
                  row[2],
                  row[3].strip() if row[3] else None,
                  row[4],
                  row[5] == "TRUE")
         for row in record_extractor}

print("d=", d)
Instead of reassigning it, I just did this and it worked:
if "T" in line[6]: # format legendary if TRUE
legendary = True
else: # format legendary if FALSE
legendary = False
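As a side note (my addition, not part of the original answers), the same trick used for row[5] == "TRUE" above collapses this if/else into a single boolean expression:

legendary = "T" in line[6]  # True exactly when the field contains a 'T'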

Python extract values from text using keys

I have a text file in the following format of Key Value
--START--
FirstName Kitty
LastName McCat
Color Red
random_data
Meow Meow
--END--
I'm wanting to extract specific values from the text into a variable or a dict. For example if I want to extract the values of LastName and Color what would be the best way to do this?
The random_data may be anywhere in the file and span multiple lines.
I've considered using regex but am concerned with performance and readability as in the real code I have many different keys to extract.
I could also loop over each line and check for each key but it's quite messy when having 10+ keys. For example:
if line.startswith("LastName"):
    # split line at space and handle
if line.startswith("Color"):
    # split line at space and handle
Hoping for something a little cleaner
tokens = ['LastName', 'Color']
dictResult = {}
with open(fileName, 'r') as fileHandle:
    for line in fileHandle:
        lineParts = line.strip().split(" ")
        if len(lineParts) == 2 and lineParts[0] in tokens:
            dictResult[lineParts[0]] = lineParts[1]
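With the sample file from the question, dictResult ends up as {'LastName': 'McCat', 'Color': 'Red'}; the strip() added above keeps the values from carrying their trailing newline.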
Assuming your file is in something called sampletxt.txt, this would work. It creates a dictionary mapping from key -> list of values.
import re

with open('sampletxt.txt', 'r') as f:
    txt = f.read()

keys = ['FirstName', 'LastName', 'Color']
d = {}
for key in keys:
    d[key] = re.findall(key + r'\s(.*)\s*\n*', txt)
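With the sample file above, this yields d == {'FirstName': ['Kitty'], 'LastName': ['McCat'], 'Color': ['Red']}; each value is a list because re.findall returns every match for the key.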
This version allows you to optionally specify the tokens
import re

s = """--START--
FirstName Kitty
LastName McCat
Color Red
random_data
Meow Meow
--END--"""

tokens = ["LastName", "Color"]

if len(tokens) == 0:
    print(re.findall(r"({0}) ({0})".format(r"\w+"), s))
else:
    print(list((t, re.findall(r"{} (\w+)".format(t), s)[0]) for t in tokens))
Output
[('LastName', 'McCat'), ('Color', 'Red')]
Building off the other answers, this function would use regular expressions to take any text key and return the value if found:
import re

file_name = 'test.txt'

def get_text_value(text_key, file_name):
    match_str = text_key + r"\s(\w+)\n"
    with open(file_name, "r") as f:
        text_to_check = f.readlines()
    text_value = None
    for line in text_to_check:
        matched = re.match(match_str, line)
        if matched:
            text_value = matched.group(1)
    return text_value

if __name__ == "__main__":
    first_key = "FirstName"
    first_value = get_text_value(first_key, file_name)
    print('Check for first key "{}" and value "{}"'.format(first_key, first_value))
    second_key = "Color"
    second_value = get_text_value(second_key, file_name)
    print('Check for second key "{}" and value "{}"'.format(second_key, second_value))
