RE Formula to get 2 strings and put into csv - python

I have the following in text files and I need to get a simple csv with the DataSourceName,FileName
Datastructure
<DataSourceDefinitionSet>
<TABFileDataSourceDefinition id="id1" readOnly="false">
<DataSourceName>AirportLayout</DataSourceName>
<FileName>\\GIS\GIS\Corporate Services\Information Services\AirportLayout.TAB</FileName>
</TABFileDataSourceDefinition>
<TABFileDataSourceDefinition id="id2" readOnly="false">
<DataSourceName>Asset_Toilets</DataSourceName>
<FileName>\\gis\gis\CITY WORKS\Infrastructure Management\Asset_Toilets.TAB</FileName>
</TABFileDataSourceDefinition>
<TABFileDataSourceDefinition id="id3" readOnly="false">
<DataSourceName>BaseLayer_Text</DataSourceName>
<FileName>\\GIS\GIS\Corporate Services\Information Services\BaseLayer_Text.TAB</FileName>
</TABFileDataSourceDefinition>
CODE
import re
filename='CRC_Public_Features.mws'
input_file = open(filename)
count=0
for line in input_file:
line = line.rstrip()
if re.search('<FileName>', line) :
line=line.replace('<Filename>','')
count+=1
print str(count)+','+line
OUTPUT
>>>
*** Remote Interpreter Reinitialized ***
>>>
1, <FileName>\\GIS\GIS\Corporate Services\Information Services\AirportLayout.TAB</FileName>
2, <FileName>\\gis\gis\CITY WORKS\Infrastructure Management\Asset_Toilets.TAB</FileName> 3,
I want
1,AirportLayout,\GIS\GIS\Corporate Services\Information
Services\AirportLayout.TAB
etc
I tried the following re but get no result.
'.([^ ]*)'
What can I do? I need the 2 lines for Datasource name and Filename together.
===== FINAL CODE USED based on accepted answer
import re
filename='CRC_Public_Features.mws'
data = open(filename).read()
count=0
#for line in infile:
#data=line
values = [re.findall(first+"(.*?)"+second, data) for first, second in [("<{}>".format(b), "</{}>".format(b)) for b in ["DataSourceName","FileName"]]]
ids = [re.search("\d+", i).group(0) for i in re.findall('id="(.*?)"', data)]
final_values = [ids[0]] + [i[0] for i in values]
DataSourceName=values[0]
FileName=values[1]
total=len(FileName)
with open("Output.csv", "w") as text_file:
text_file.write("ID,DataSourceName,FileName,MWS\n")
for item in FileName:
print str(count+1)+","+str(DataSourceName[count])+","+str(FileName[count])
with open("Output.csv", "a") as text_file:
text_file.write(str(count+1)+","+str(DataSourceName[count])+","+str(FileName[count])+","+str(filename)+"\n")
count+=1

With xml.etree.ElementTree and csv modules:
import xml.etree.ElementTree as ET, csv
tree = ET.parse('CRC_Public_Features.mws')
root = tree.getroot()
with open('result.csv', 'w', newline='') as f:
writer = csv.writer(f, delimiter=',')
for i,ds in enumerate(root.findall('TABFileDataSourceDefinition'), 1):
writer.writerow([i, ds.find('DataSourceName').text, ds.find('FileName').text])
Final result.csv contents:
1,AirportLayout,\\GIS\GIS\Corporate Services\Information Services\AirportLayout.TAB
2,Asset_Toilets,\\gis\gis\CITY WORKS\Infrastructure Management\Asset_Toilets.TAB
3,BaseLayer_Text,\\GIS\GIS\Corporate Services\Information Services\BaseLayer_Text.TAB

You can try this:
import re
filename='CRC_Public_Features.mws'
data = open(filename).read()
values = [re.findall(first+"(.*?)"+second, data) for first, second in [("<{}>".format(b), "</{}>".format(b)) for b in ["DataSourceName","FileName"]]]
ids = [re.search("\d+", i).group(0) for i in re.findall('id="(.*?)"', data)]
final_values = [ids[0]] + [i[0] for i in values]
Output:
['1', 'AirportLayout', '\\GIS\\GIS\\Corporate Services\\Information Services\\AirportLayout.TAB']

Related

Defining a list within the code versus reading it from a file

I am trying to count the number of specific words in a given report. Does anyone know why defining a list within the code makes the second part of the following code run faster than reading the list from a file? Is there a solution? The list contains the same words is a lot longer than two words in the following example.
# Example code: Within code list
import csv
import glob
import re
import time
TARGET_FILES = r'C:/Users/s170760/Desktop/Reports_Cleaned/*.*'
OUTPUT_FILE = r'C:/Users/s170760/Desktop/Parser.csv'
OUTPUT_FIELDS = ['file name', 'create']
create = {'agile', 'skills'}
def main():
f_out = open(OUTPUT_FILE, 'w')
wr = csv.writer(f_out, lineterminator='\n')
wr.writerow(OUTPUT_FIELDS)
file_list = glob.glob(TARGET_FILES)
for file in file_list:
print(file)
with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
doc = f_in.read()
doc = doc.lower()
output_data = get_data(doc)
output_data[0] = file
wr.writerow(output_data)
def get_data(doc):
_odata = [0] * 2
tokens = re.findall('\w(?:[-\w]*\w)?', doc)
for token in tokens:
if token in create:
_odata[1] += 1
return _odata
Here is the other way:
# Example code: Reading list from a file
import csv
import glob
import re
import time
TARGET_FILES = r'C:/Users/s170760/Desktop/Reports_Cleaned/*.*'
OUTPUT_FILE = r'C:/Users/s170760/Desktop/Parser.csv'
OUTPUT_FIELDS = ['file name', 'create']
create = open('C:/Users/s170760/Desktop/Create.txt', 'r').read().splitlines()
def main():
f_out = open(OUTPUT_FILE, 'w')
wr = csv.writer(f_out, lineterminator='\n')
wr.writerow(OUTPUT_FIELDS)
file_list = glob.glob(TARGET_FILES)
for file in file_list:
print(file)
with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
doc = f_in.read()
doc = doc.lower()
output_data = get_data(doc)
output_data[0] = file
wr.writerow(output_data)
def get_data(doc):
_odata = [0] * 2
tokens = re.findall('\w(?:[-\w]*\w)?', doc)
for token in tokens:
if token in create:
_odata[1] += 1
return _odata
As pointed out by Mark in the comments, the first code snippet uses a set of strings, while the second code snippet loads a file into a list of strings.
Why sets are faster than lists in this use case, is well explained in this Stack Overflow answer. Parsing the output of open to a set can indeed solve your problem.
So replace:
create = open('C:/Users/s170760/Desktop/Create.txt', 'r').read().splitlines()
With:
create = set(open('C:/Users/s170760/Desktop/Create.txt', 'r').read().splitlines())

writing into a csv file with python

heres my little program. at the end i want to write the names and passwords
into csv file like this:
Jack,9978
Sara,1647
but i cant!? my program output is correct but when i write it into csv it goes like:
Jack9978,Sara1674
how will you fix it?
import hashlib
import csv
answer = []
usr_pas = []
with open('...', 'r') as f:
reader = csv.reader(f)
for word in reader:
usr_pas.append(word)
for i in range(999, 10000):
num = str(i)
m = hashlib.sha256()
m.update(num.encode('utf-8'))
hsh = m.hexdigest()
hash_dict = {hsh: num}
for key in list(hash_dict.items()):
for value in usr_pas:
if key[0] == value[1]:
answer.append(value[0] +','+ key[1])
file = open("...", 'w', newline='')
with file:
writer = csv.writer(file)
writer.writerow(i.strip().replace(',', '') for i in answer)
file.close()
what did i wrong!?
Try this (lines with comments are changed):
import hashlib
import csv
answer = []
usr_pas = []
with open('...', 'r') as f:
reader = csv.reader(f)
for word in reader:
usr_pas.append(word)
for i in range(999, 10000):
num = str(i)
m = hashlib.sha256()
m.update(num.encode('utf-8'))
hsh = m.hexdigest()
hash_dict = {hsh: num}
for key in list(hash_dict.items()):
for value in usr_pas:
if key[0] == value[1]:
answer.append(value[0] +','+ key[1] + '\n') #added '\n' at the end
file = open("...", 'w', newline='')
with file:
writer = csv.writer(file)
writer.writerow(i for i in answer) #removed i.replace
file.close()
I guess you want a csv file with multiple lines instead of one. If so, my suggestion is to use csv.csvwriter.writerows instead of csv.csvwriter.writerow. The latter is designed to write a single row. See the official document here. Indeed multiple lines might be created with \n manipulator, it means a single line with multiple elements that contains "new line", which seems awkward.
Since we can use the default delimiter (comma), we just need to manage each element in the line as a tuple (or a list). Answers should be added into list answer like this:
answer.append((value[0], key[1]))
while we write rows in this way:
writer.writerows(answer)
Let's put them together:
import hashlib
import csv
answer = []
usr_pas = []
with open('...', 'r') as f:
reader = csv.reader(f)
for word in reader:
usr_pas.append(word)
for i in range(999, 10000):
num = str(i)
m = hashlib.sha256()
m.update(num.encode('utf-8'))
hsh = m.hexdigest()
hash_dict = {hsh: num}
for key in list(hash_dict.items()):
for value in usr_pas:
if key[0] == value[1]:
# answer.append(value[0] +','+ key[1])
answer.append((value[0], key[1]))
file = open("...", 'w', newline='')
with file:
writer = csv.writer(file)
# writer.writerow(i.strip().replace(',', '') for i in answer)
writer.writerows(answer)
file.close()

Cannot pick and append list elements into a existing csv

I have a list with the following format
Mylist = [['5AEEP1','0','1','LAP1'],['5XXEP1','0','1','LAP2'],['5AXAP1','0','1','LAP3']]
I am trying to get the first and last element and append them into an existing csv
5AEEP1,LAP1
5XXEP1,LAP2
5AXAP1,LAP3
with the following
with open(old_pcodes,"a",encoding='utf-8', newline="") as infile:
writer = csv.writer(infile, delimiter=';',quoting=csv.QUOTE_NONE)
towrite =[]
for ritem in Mylist:
if ritem:
pno = ritem[0]
thepcode = ritem[3]
finalout = pno+';'+thepcode
finalout.strip('"')
writer.writerow([finalout])
I get an escape error
If I add
writer = csv.writer(infile, delimiter=';',quoting=csv.QUOTE_NONE, escapechar=' ')
Then I have in the csv a space
5AEEP1 ,LAP1
5XXEP1 ,LAP2
5AXAP1 ,LAP3
How else can I do it
You can write them with writerows(..) all at once:
import csv
Mylist = [['5AEEP1','0','1','LAP1'],['5XXEP1','0','1','LAP2'],['5AXAP1','0','1','LAP3']]
with open("t.txt","a",encoding='utf-8', newline="") as infile:
writer = csv.writer(infile, delimiter=';',quoting=csv.QUOTE_NONE)
writer.writerows( (i[0],i[3]) for i in Mylist )
with open("t.txt") as f:
print(f.read())
Output:
5AEEP1;LAP1
5XXEP1;LAP2
5AXAP1;LAP3

Python file matching and appending

This is one file result.csv:
M11251TH1230
M11543TH4292
M11435TDS144
This is another file sample.csv:
M11435TDS144,STB#1,Router#1
M11543TH4292,STB#2,Router#1
M11509TD9937,STB#3,Router#1
M11543TH4258,STB#4,Router#1
Can I write a Python program to compare both the files and if line in result.csv matches with the first word in the line in sample.csv, then append 1 else append 0 at every line in sample.csv?
import pandas as pd
d1 = pd.read_csv("1.csv",names=["Type"])
d2 = pd.read_csv("2.csv",names=["Type","Col2","Col3"])
d2["Index"] = 0
for x in d1["Type"] :
d2["Index"][d2["Type"] == x] = 1
d2.to_csv("3.csv",header=False)
Considering "1.csv" and "2.csv" are your csv input files and "3.csv" is the result you needed
The solution using csv.reader and csv.writer (csv module):
import csv
newLines = []
# change the file path to the actual one
with open('./data/result.csv', newline='\n') as csvfile:
data = csv.reader(csvfile)
items = [''.join(line) for line in data]
with open('./data/sample.csv', newline='\n') as csvfile:
data = list(csv.reader(csvfile))
for line in data:
line.append(1 if line[0] in items else 0)
newLines.append(line)
with open('./data/sample.csv', 'w', newline='\n') as csvfile:
writer = csv.writer(csvfile)
writer.writerows(newLines)
The sample.csv contents:
M11435TDS144,STB#1,Router#1,1
M11543TH4292,STB#2,Router#1,1
M11509TD9937,STB#3,Router#1,0
M11543TH4258,STB#4,Router#1,0
With only one column, I wonder why you made it as a result.csv. If it is not going to have any more columns, a simple file read operation would suffice. Along with converting the data from result.csv to dictionary will help in quick run as well.
result_file = "result.csv"
sample_file = "sample.csv"
with open(result_file) as fp:
result_data = fp.read()
result_dict = dict.fromkeys(result_data.split("\n"))
"""
You can change the above logic, in case you have very few fields on csv like this:
result_data = fp.readlines()
result_dict = {}
for result in result_data:
key, other_field = result.split(",", 1)
result_dict[key] = other_field.strip()
"""
#Since sample.csv is a real csv, using csv reader and writer
with open(sample_file, "rb") as fp:
sample_data = csv.reader(fp)
output_data = []
for data in sample_data:
output_data.append("%s,%d" % (data, data[0] in result_dict))
with open(sample_file, "wb") as fp:
data_writer = csv.writer(fp)
data_writer.writerows(output_data)
The following snippet of code will work for you
import csv
with open('result.csv', 'rb') as f:
reader = csv.reader(f)
result_list = []
for row in reader:
result_list.extend(row)
with open('sample.csv', 'rb') as f:
reader = csv.reader(f)
sample_list = []
for row in reader:
if row[0] in result_list:
sample_list.append(row + [1])
else:
sample_list.append(row + [0]
with open('sample.csv', 'wb') as f:
writer = csv.writer(f)
writer.writerows(sample_list)

How to export the result of print into a csv file in Python?

How to export the result of following code into a csv file in python?
print(['%s %0.2f'%(node,centrality[node]) for node in centrality])
The result looks like this:
['faef 0.37', 'efef 0.60', 'vsav 0.60', 'fadf 0.37']
And I want it to be like this in a file.csv:
faef,0.37
efef,0.60
vsav,0.60
fadf,0.37
Thx ahead! I am using python 2.7 in Ubuntu
import csv
data = ['faef 0.37', 'efef 0.60', 'vsav 0.60', 'fadf 0.37']
with open('file.csv', 'w', newline='') as fp:
a = csv.writer(fp, delimiter=',')
complete_data = []
for d in data:
data_to_write = d.split(" ")
complete_data.append(data_to_write)
a.writerows(data)
import csv
result = ['faef 0.37', 'efef 0.60', 'vsav 0.60', 'fadf 0.37']
with open('path/to/file', 'w') as outfilehandle:
outfile = csv.writer(outfilehandle, delimiter=',')
outfile.writelines([s.split() for s in result])
import csv
data = ['faef 0.37', 'efef 0.60', 'vsav 0.60', 'fadf 0.37']
with open('test.csv', 'w') as fp:
writer = csv.writer(fp, delimiter=',')
for x in data:
k, v = x.split(' ')
writer.writerow([k, v])
OUTPUT file
faef,0.37
efef,0.60
vsav,0.60
fadf,0.37

Categories