Printing parsed data from a csv file to a new file - python

I am using Python to parse data from the following csv file -
{::[name]str1_str2_str3[0]},1,U0.00 - Sensor1 Not Ready\nTry Again,1,0,12
{::[name]str1_str2_str3[0]},2,U0.01 - Sensor2 Not Ready\nTry Again,1,0,12
{::[name]str1_str2_str3[0]},3,U0.02 - \n,1,0,12
{::[name]str1_str2_str3[0]},4,U0.03 - Sensor4 Not Ready\nTry Again,1,0,12
{::[name]str1_str2_str3[0]},5,U0.04 - \n,1,0,12
From the column1, I am parsing the value 0 within the [ ]. Then the value in column2 and from column3, I am parsing the substring "Sensor1 Not Ready" and then printing to another file as follows -
SENSOR1_NOT_READY 0,1
SENSOR2_NOT_READY 0,2
and so on...
Now when I print the parsed values I get the following -
SENSOR1_NOT_READY 0,1
SENSOR2_NOT_READY 0,2
SENSOR2_NOT_READY 0,3
SENSOR4_NOT_READY 0,4
SENSOR4_NOT_READY 0,5
I want to skip printing the lines with no data in column3 (for example - lines 3 and 5 in the csv file). How should I do that?
Expected output -
SENSOR1_NOT_READY 0,1
SENSOR2_NOT_READY 0,2
SENSOR4_NOT_READY 0,4
Following is my Python script -
with open('filename.csv','rb') as csvfile:
reader = csv.reader(csvfile)
for row in reader:
tag_name = row[0]
bit_num = row[1]
error_name = row[2]
# Regular expressions
term0 = '\[(\d)\].*'
term1 = '(\d+)'
term2 = r'.*-\s([\w\s]+)\\n'
capture0 = list(re.search(term0, tag_name).groups())
capture1 = list(re.search(term1, bit_num).groups())
temp = re.search(term2, error_name)
if temp:
result = list(temp.groups())
else:
None
result[-1] = '_'.join(result[-1].split()).upper()
capture2 = ','.join(result)
tp = (capture0[0], capture1[0], capture2) # Tuple
f.write('{2} {0},{1},\n'.format(tp[0], tp[1], tp[2]))

Build a regex that searches for 'normal' lines. Maybe something like "^U0.0[1-5] - \n$"? Then use something like if not re.search(x): before you print the error.
with open('filename.csv','rb') as csvfile:
reader = csv.reader(csvfile)
for row in reader:
tag_name = row[0]
bit_num = row[1]
error_name = row[2]
# Regular expressions
term0 = '\[(\d)\].*'
term1 = '(\d+)'
term2 = r'.*-\s([\w\s]+)\\n'
term3 = '^U0.0[1-5] - \n$'
capture0 = list(re.search(term0, tag_name).groups())
capture1 = list(re.search(term1, bit_num).groups())
temp = re.search(term2, error_name)
if temp:
result = list(temp.groups())
else:
None
result[-1] = '_'.join(result[-1].split()).upper()
capture2 = ','.join(result)
tp = (capture0[0], capture1[0], capture2) # Tuple
if not re.search(temp3,error_name):
f.write('{2} {0},{1},\n'.format(tp[0], tp[1], tp[2])) #I assume this is the print line?

Related

How to pick values for specific times in a date (large list of date, time, value)

I have a file with these columns: date, times, and value of a stock. Basically, per-minute value of stocks. I would like to calculate the difference in the value of a stock at 10 AM and 4 PM. This is the code I have so far:
fileName = "C:\\...\\US_200901_210907.csv"
with open(fileName) as f:
for line in f.readlines()[1:]:
split = line.split(";")
time = split[3]
date = split[2]
for timev in f.readlines()[1:]:
if timev == '100000':
Spot = float(split[2])
elif timev == '160000':
Close = float(split[2])
Diff = Spot - Close
print(Diff)
I am not sure if I am doing this right. But the code needs to cycle/loop through each date first, find the value of the stock at '100000' and '160000' and then calculate the difference between the two. Then move to the next day. And at the end of all days, print the differences for each day.
The "Diff = Spot - Close" line also gives me an error, says "NameError: name 'Spot' is not defined"
Any help is appreciated.
Dataset looks like this (extract):
====================
After working more on this on my own, I was able to get this to work:
import csv
filename = "C:\\...\\US_200901_210907.csv"
with open(filename, 'r') as f:
reader = csv.reader(f, delimiter=';')
next(reader, None) # skip header
rows = list(reader)
listOfDates = []
index = 0
for row in rows:
if rows[index][2] not in listOfDates:
listOfDates.append(rows[index][2])
index = index + 1
print(listOfDates)
startPrice = 0
endPrice = 0
index = 0
startPriceSet = False
endPriceSet = False
for date in listOfDates:
for row in rows:
if rows[index][2] == date:
# print(rows[index][2])
# print(date)
if rows[index][3] == '100000':
startPrice = float(rows[index][7])
startPriceSet = True
elif rows[index][3] == '160000':
endPrice = float(rows[index][7])
endPriceSet = True
index = index + 1
if startPriceSet and endPriceSet:
print(date, startPrice, endPrice, startPrice - endPrice)
startPriceSet = False
endPriceSet = False
Why not leverage a pandas DataFrame for this calculation -
import pandas as pd
df = pd.read_csv("C:\\...\\US_200901_210907.csv")
# give appropriate column names before or after loading the data
# assuming we have the columns 'time', 'date' & 'stockvalue' in df
# might have to use pandas.to_datetime
print(df[(df['time']=='time1') && (df['date']=='date1')]['stockvalue']-df[(df['time']=='time2') && (df['date']=='date1')]['stockvalue'])
Also, why do you have an embedded for loop?
One of the approach with the sheet you have provided:
import pandas as pd
from collections import defaultdict
df = pd.read_excel("Data.xlsx", header=None, dtype='str')
out = defaultdict(lambda: defaultdict(float))
for rowindex, row in df.iterrows():
date = row[2]
name = row[0]
if row[3] == "100000":
out[name]['DATE'] = row[2]
out[name]['START'] = float(row[4])
if row[3] == "160000":
out[name]['END'] = float(row[4])
for stock, data in out.items():
print (stock+': DATE: '+data['DATE']+' START: '+data['START']+' END:'+data['END']+' diff = '+str(int(data['END']-data['START'])))

Python read XML file (near 50mb)

I'm parsing a XML String into CSV string but it's going very slow:
INDEX_COLUMN = "{urn:schemas-microsoft-com:office:spreadsheet}Index"
CELL_ELEMENT = "Cell"
DATA_ELEMENT = "Data"
def parse_to_csv_string(xml):
print('parse_to_csv_string')
csv = []
parsed_data = serialize_xml(xml)
rows = list(parsed_data[1][0])
header = get_cells_text(rows[0])
rows.pop(0)
csv.append(join(",", header))
for row in rows:
values = get_cells_text(row)
csv.append(join(",", values))
return join("\n", csv)
def serialize_xml(xml):
return ET.fromstring(xml)
def get_cells_text(row):
keys = []
cells = normalize_row_cells(row)
for elm in cells:
keys.append(elm[0].text or "")
while len(keys) < 92:
keys.append("")
return keys
def normalize_row_cells(row):
cells = list(row)
updated_cells = copy.deepcopy(cells)
pos = 1
for elm in cells:
strIndexAttr = elm.get(INDEX_COLUMN)
index = int(strIndexAttr) if strIndexAttr else pos
while index > pos:
empty_elm = ET.Element(CELL_ELEMENT)
child = ET.SubElement(empty_elm, DATA_ELEMENT)
child.text = ""
updated_cells.insert(pos - 1, empty_elm)
pos += 1
pos += 1
return updated_cells
The XML String sometimes miss a few columns and I need to iterate it to fill missing columns - every row must have 92 columns. That's why I have some helper functions to manipulate XML.
Right now I'm running my function with 4GB as Lambda and still getting timeout :(
Any idea on how to improve performance?
The normalize_row_cells constructs ElementTree Element instances but get_cells_text is only interested in each instance's child's text attribute, so I would consider changing normalize_row_cells to just return the text. Also, it's performing copies and calling list.insert: inserting elements into the middle of lists can be expensive, because each element after the insertion point must be moved.
Something like this (untested code) avoids making copies and insertions and returns only the required text, making get_cells_text redundant.
def normalize_row_cells(row):
cells = list(row)
updated_cells = []
pos = 1
for _ in range(0, 92):
elm = cells[pos - 1]
strIndexAttr = elm.get(INDEX_COLUMN)
index = int(strIndexAttr) if strIndexAttr else pos
if index == pos:
updated_cells.append(elm[0].text)
pos += 1
else:
update_cells.append("")
return updated_cells
If you can match your cells to their header names then using csv.DictWriter from the standard library might be even better (you need to profile to be sure).
import csv
import io
def parse_to_csv_string(xml):
print('parse_to_csv_string')
csv = []
parsed_data = serialize_xml(xml)
rows = list(parsed_data[1][0])
header = get_cells_text(rows[0])
with io.StringIO() as f:
writer = csv.DictWriter(f, fieldnames=header)
for row in rows:
row = get_cells_text(row)
writer.writerow(row)
f.seek(0)
data = f.read()
return data
def get_cells_text(row):
row_dict = {}
for cell in row:
column_name = get_column_name(cell) # <- can this be done?
row_dict[column_name] = elm[0].text or ""
return row_dict

I can't figure out why I get a blank output file

import csv
import requests
import re
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf8')
#CREATE CSV FILE
outfile = open("./output.csv", "wb")
writer = csv.writer(outfile)
#IMPORT MATCHES
import csv
with open('matches.csv', 'rb') as f:
reader = csv.reader(f)
matches = list(reader)
for id in matches:
id = str(id)
id = re.sub("[^0-9]","",id)
url = 'http://www.virtualpronetwork.com/apps/fvpaa/matches/match_report/' + id
print (url)
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)
#GET TEAMS AND SCORES
score = soup.findAll("div",{"class":"col-md-5 center"})
team_home = score[0]
team_home = str(team_home)
team_home = re.search('title="(.*)" />',team_home)
team_home = team_home.group(1)
team_away = score[1]
team_away = str(team_away)
team_away = re.search('title="(.*)" />',team_away)
team_away = team_away.group(1)
goals_home = score[2]
goals_home = str(goals_home)
goals_home = re.sub('</h2></div>','',goals_home)
goals_home = re.sub('<div class="col-md-5 center"><h2>','',goals_home)
goals_away = score[3]
goals_away = str(goals_away)
goals_away = re.sub('</h2></div>','',goals_away)
goals_away = re.sub('<div class="col-md-5 center"><h2>','',goals_away)
#GET HOME STATS
tables = soup.findChildren('table')
stats_home = tables[0]
list_of_rows_home = []
for row in stats_home.findChildren('tr')[1:]:
list_of_cells = []
for cell in row.findChildren('td')[0]:
text = cell.text
list_of_cells.append(text)
for cell in row.findChildren('td')[1]:
text = cell.text
list_of_cells.append(text)
for cell in row.findChildren('td')[2:]:
list_of_cells.append(cell)
list_of_rows_home.append(list_of_cells)
for i in range(len(list_of_rows_home)):
row = list_of_rows_home[i]
cell = list_of_rows_home[i][2]
cell = str(cell)
goal = re.findall('goal',cell)
goal = goal.count('goal')
goal = goal / 2
assist = re.findall('assist',cell)
assist = assist.count('assist')
assist = assist / 2
motm = re.findall('motm',cell)
motm = motm.count('motm')
row.append(goal)
row.append(assist)
row.append(motm)
for row in list_of_rows_home:
del row[2]
for i in range(len(list_of_rows_home)):
row = list_of_rows_home[i]
row.append(team_home)
row.append(goals_home)
row.append(team_away)
row.append(goals_away)
#GET AWAY STATS
stats_away = tables[1]
list_of_rows_away = []
for row in stats_away.findChildren('tr')[1:]:
list_of_cells = []
for cell in row.findChildren('td')[0]:
text = cell.text
list_of_cells.append(text)
for cell in row.findChildren('td')[1]:
text = cell.text
list_of_cells.append(text)
for cell in row.findChildren('td')[2:]:
list_of_cells.append(cell)
list_of_rows_away.append(list_of_cells)
for i in range(len(list_of_rows_away)):
row = list_of_rows_away[i]
cell = list_of_rows_away[i][2]
cell = str(cell)
goal = re.findall('goal',cell)
goal = goal.count('goal')
goal = goal / 2
assist = re.findall('assist',cell)
assist = assist.count('assist')
assist = assist / 2
motm = re.findall('motm',cell)
motm = motm.count('motm')
row.append(goal)
row.append(assist)
row.append(motm)
for row in list_of_rows_away:
del row[2]
for i in range(len(list_of_rows_away)):
row = list_of_rows_away[i]
row.append(team_away)
row.append(goals_away)
row.append(team_home)
row.append(goals_home)
#COMPILE INTO ONE TABLE
list_of_rows = list_of_rows_home + list_of_rows_away
#WRITE TO CSV
writer.writerows(list_of_rows)
My input file is a basic excel file with the match id's all lined up in column one of the excel file. When it creates the output file, it's blank. I am not getting any error messages either.
The issue is in your regex search, so perhaps change it to:
team_home = re.search('title="(.*)"',team_home)
team_home = team_home.group(1)
Alternative:
team_home = re.search('title="(.*)"/>',team_home)
team_home = team_home.group(1)
The /> is not needed, and this essentially makes title="" not match for group(1), which in turn creates an Attribute Error, and the script stops. If you want to include /> then remove the space in your regex pattern, since that is ultimately what kills it.

Returning a row that matches specified condition, and edit particular columns in row. Then write to csv file with changed row

I'm writing a python script that works with two csv files. Lets call them csv1.csv (original file to read) and csv2.csv (exact copy of csv1). The goal is to find the row and column in the csv file that corresponds to the the modified user-defined input.
csv format:(continues for about 2-3 thousand lines)
record LNLIM, ID_CO,OD_DV,ID_LN, ST_LN, ZST_LN, ID_LNLIM,LIMIT1_LNLIM, LIMIT2_LNLIM, LIMIT3_LNLIM
LNLIM, 'FPL', 'SOUT', '137TH_LEVEE_B', 'B', '137TH_AV', 'LEVEE', 'A', 1000, 1100, 1200
LNLIM, 'FPL', 'SOUT', '137TH_DAVIS_B', 'A', '137TH_AV', 'NEWTON', 'A', 1000, 1100, 1200
...
Let's say that the user is looking for 137TH_AV and NEWTON. I want to be able to go row by row and compare the two columns/row indices ST_LN and ZST_LN. If both columns match what the user inputted then I want to capture which row in the csv file that happened on, and use that information to edit the remaining columns LIMIT1_LNLIM LIMIT2_LNLIM LIMIT3_LNLIM on that row with new analog values.
I want to get the 3 new values provided by the user and edit a specific row, and a specific row element. Once I've found the place to replace the number values I want to overwrite csv2.csv with this edit.
Determining where the line segment is located in the array
import sys
import csv
import os
import shutil
LineSectionNames = []
ScadaNames = []
with open('Vulcan_Imp_Summary.csv', 'r') as file:
reader = csv.reader(file)
for row in reader:
LineSectionName = row[1]
ScadaName = row[29]
LineSectionNames.append(LineSectionName)
ScadaNames.append(ScadaName)
#Reformatting arrays for accurate references
LineSectionNames = [character.replace('\xa0', ' ') for character in LineSectionNames]
LineSectionNames = [character.replace('?', '-') for character in LineSectionNames]
ScadaNames = [character.replace('\xa0', ' ') for character in ScadaNames]
#Setting Line Section name as key and Scada name as value
ScadaDict = {}
for i in range(len(LineSectionNames)):
ScadaDict[LineSectionNames[i]] = ScadaNames[i]
#Prompt user for grammatical name of Line Section
print ('Enter the Line Section Name: (Example = Goulds-Princeton) \n')
user_input = input()
#Reference user input to dictionary value to convert input into SCADA format
def reformat():
print ('Searching for Line Section...' + user_input)
if user_input in ScadaDict:
value = ScadaDict[user_input]
print ('\n\t Match!\n')
else:
print ('The Line Section name you have entered was incorrect. Try again. \n Example = Goulds-Princeton')
reformat()
# Copying the exported file from Genesys
path = 'I://PSCO//DBGROUP//PatrickL//'
shutil.copyfile(path + 'lnlim_import.csv', path + 'lnlim_import_c.csv')
#Using the SCADA format to search through csv file
print ('Searching csv file for...' + user_input)
# Reading the copied file
record_lnlims = []
id_cos = []
id_dvs = []
id_lines = []
id_lns = []
st_lns = []
zst_lns = []
id_lnlims = []
limit1_lnlims = []
limit2_lnlims = []
limit3_lnlims = []
with open('lnlim_import_c.csv', 'r') as copy:
reader = csv.reader(copy)
for row in reader:
record_lnlim = row[0]
id_co = row[1]
id_dv = row[2]
id_line = row[3]
id_ln = row[4]
st_ln = row[5]
zst_ln = row[6]
id_lnlim = row[7]
limit1_lnlim = row[8]
limit2_lnlim = row[9]
limit3_lnlim = row[10]
record_lnlims.append(record_lnlim)
id_cos.append(id_co)
id_dvs.append(id_dv)
id_lines.append(id_line)
id_lns.append(id_ln)
st_lns.append(st_ln)
zst_lns.append(zst_ln)
id_lnlims.append(id_lnlim)
limit1_lnlims.append(limit1_lnlim)
limit2_lnlims.append(limit2_lnlim)
limit3_lnlims.append(limit3_lnlim)
#Reformatting the user input from GOULDS-PRINCETON to 'GOULDS' and 'PRINCETON'
input_split = user_input.split('-', 1)
st_ln1 = input_split[0]
zst_ln1 = input_split[1]
st_ln2 = st_ln1.upper()
zst_ln2 = zst_ln1.upper()
st_ln3 = "'" + str(st_ln2) + "'"
zst_ln3 = "'" + str(zst_ln2) + "'"
#Receiving analog values from user
print ('\n\t Found! \n')
print ('Enter the Specified Emergency Rating (A) for 110% for 7 minutes: ')
limit1_input = input()
print ('Enter the Specified Emergency Rating (A) for 120% for 7 minutes: ')
limit2_input = input()
print ('Enter the Specified Emergency Rating (A) for 130% for 5 minutes: ')
limit3_input = input()
Whenever I print the row_index it prints the initialized value of 0.
i = 0
row_index = 0
for i in range(len(st_lns)):
if st_ln3 == st_lns[i] and zst_ln3 == zst_lns[i]:
row_index = i
print(row_index)
limit1_input = limit1_lnlims[row_index]
limit2_input = limit2_lnlims[row_index]
limit3_input = limit3_lnlims[row_index]
csv_list = []
csv_list.append(record_lnlims)
csv_list.append(id_cos)
csv_list.append(id_dvs)
csv_list.append(id_lines)
csv_list.append(st_lns)
csv_list.append(zst_lns)
csv_list.append(id_lnlims)
csv_list.append(limit1_lnlims)
csv_list.append(limit2_lnlims)
csv_list.append(limit3_lnlims)
#Editing the csv file copy to implement new analog values
with open('lnlim_import_c.csv', 'w') as edit:
for x in zip(csv_list):
edit.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\n".format(x))

Writing values CSV file

I am trying to write some output to csv from my code below. First column should have all of the valid IDs with a header that says “Valid (count in parenthesis)”. The second column should contain a list of all of the non-valid IDs and have a header that says “Non-valid (count in parenthesis)”. Any idea how I do this?
import csv
# csv_path = r'C:\temp\data\fileA'
csv_path = r'C:\temp\data\fileA'
reader = csv.reader(open(csv_path, 'r'), dialect='excel-tab')
reader.next() # ignore heading
min_id = 1503332138
max_id = 1503632138
valid_ids = []
invalid = []
x = 0
for line in reader:
pv = line[1]
if id.isdigit() and int(id) >= min_id and int(id) <= max_id:
if id not in valid_ids:
valid_ids.append(id)
else:
if id not in invalid:
invalid.append(id)
print 'Valid IDs (',len(valid_ids),')'
for valid in valid_ids:
print valid
print 'Invalid IDs (',len(invalid),')'
for invalid in invalid:
print invalid
# ...
# Continuing from point where you have valid_ids and invalid lists populated
data = [('Valid IDs', valid_ids), ('Invalid IDs', invalid)]
# Create header
header = []
for (label, id_list) in data:
label_with_count = '%s (%d)' % (label, len(id_list))
header.append(label_with_count)
# Write to CSV file
with open('path_to_output_file.csv') as out_csv_file:
csv_writer = csv.writer(out_csv_file)
csv_writer.writerow(header)
for (idx, dataset) in enumerate(data):
(label, id_list) = dataset
for id in id_list:
row = (idx * ['']) + [id] + ((len(data) - idx - 1) * [''])
csv_writer.writerow(row)

Categories