I have a loop that searches keywords in a text file and pastes integers that follow the keywords into an excel file. I want the integers to be in specific cells in the excel file. Is it possible to increment i by the values in i_list rather than always 5 like in the example?
i_list = [5,3,1,1]
def search(file, excel_file):
i = 2
found_keywords = list()
wb = load_workbook(excel_file)
sheets = wb.sheetnames
sheet1 = wb[sheets[0]]
for kwrd in keywords:
for line in file:
if kwrd in line and kwrd not in found_keywords:
found_keywords.append(kwrd)
sheet1.cell(row=3, column=i).value = int(re.search(r"\d+", line).group())
i += 5
elif kwrd in line:
continue
wb.save(excel_file)
If you dont need to cycle through the values of i then you can just create a generator to return you the values of i one by one. I have wrapped the calling of the next value of i in a try block since once you run out of values the code wouldnt know what to do. so we break the loop
i_list = (i for i in [5,3,1,1])
i = 2
for _ in range(10):
print(i)
try:
i += next(i_list)
except StopIteration as si:
print("no more values in i so loop terminating")
break
OUTPUT
2
7
10
11
12
no more values in i so loop terminating
However if you want to cycle thorugh the values of i you can use cycle from itertools module and infintly take the next item from i_list for as long as you need
from itertools import cycle
i_list = cycle([5,3,1,1])
i = 2
for _ in range(10):
print(i)
i += next(i_list)
OUTPUT
2
7
10
11
12
17
20
21
22
27
UPDATE OF YOUR CODE
below is an update of your code based on the fact you said you dont have to cycle. Remember that once you reach the end of i_list your code will not be able to increase i since there are no more values in i_list.
i_list = [5,3,1,1]
i_generator = (i for i in i_list)
def search(file, excel_file):
i = 2
found_keywords = list()
wb = load_workbook(excel_file)
sheets = wb.sheetnames
sheet1 = wb[sheets[0]]
for kwrd in keywords:
for line in file:
if kwrd in line and kwrd not in found_keywords:
found_keywords.append(kwrd)
sheet1.cell(row=3, column=i).value = int(re.search(r"\d+", line).group())
i += next(i_generator)
elif kwrd in line:
continue
wb.save(excel_file)
This code snippet, adjusted from the original, cycles through the values of i_list:
i_list = [5,3,1,1]
def search(file, excel_file):
i = 2
found_keywords = list()
wb = load_workbook(excel_file)
sheets = wb.sheetnames
sheet1 = wb[sheets[0]]
for kwrd in keywords:
for line in file:
if kwrd in line and kwrd not in found_keywords:
for i in i_list: # Update i based on the i-list value
sheet1.cell(row=3, column=i).value = int(re.search(r"\d+", line).group())
elif kwrd in line:
continue
wb.save(excel_file)
Related
import pdfplumber
import openpyxl
pdf = pdfplumber.open("path")
page = pdf.pages [0]
text = page.extract_text()
lin = text.split("\n")
wb = openpyxl.load_workbook ("path")
ws= wb.active
ws.title = "S1"
u = int(0)
i = int(1)
for u in lin:
ws.cell(row = i, column =1).value = lin [u]
u = u+1
i = i+1
wb.save ("path")
pdf.close
print("Ok!")
Error:
TypeError: list indices must be integers or slices, not str
The error occurs on for.
I split each line of the pdf and now I want to write each line in excel.
Example:
line in specific pdf file:
A
B
C
D
I got every line value from the pdf. The variable "lin" for example, has the value A in line 0, the value B in line 1.
I want to take the value of lin 0 and write it in cell A1, and then in the same column take the value of lin 1 and write it in cell A2 in excel and so on.
You should use an integer variable u as the index.
u = 0
i = 1
for u in range(len(lin)):
ws.cell(row=i, column=1).value = lin[u]
u += 1
i += 1
You can also replace the for loop with a for-each loop:
for value in lin:
ws.cell(row=i, column=1).value = value
i += 1
In this case you don't need to worry about the index variable.
pdf.close()
So let's say I have this txt file formatted (value)(space)(value) and there's a second set of numbers separated with a (tab). An example file is given here:
Header
5 5 6 7 8 7 8 9 0 1
7 6 3 4 1 1 3 6 8 1
8 7 4 1 3 1 9 8 5 1
Now I'm using this code to print all the values shown in the txt file:
NEWLINE = "\n"
def readBoardFromFile():
inputFileOK = False
aBoard = []
while (inputFileOK == False):
try:
inputFileName = input("Enter the name of your file: ")
inputFile = open(inputFileName, "r")
print("Opening File " + inputFileName + "for reading")
currentRow = 0
next(inputFile)
for line in inputFile:
aBoard.append([])
for ch in line:
if (ch != NEWLINE):
aBoard[currentRow].append(ch)
currentRow = currentRow + 1
inputFileOK = True
print("Completed reading of file " + inputFileName)
except IOError:
print("Error: File couldn't be opened")
numRows = len(aBoard)
numColumns = len(aBoard[0])
return(aBoard,numRows,numColumns)
def display(aBoard, numRows, numColumns):
currentRow = 0
currentColumn = 0
print("DISPLAY")
while (currentRow < numRows):
currentColumn = 0
while (currentColumn < numColumns):
print("%s" %(aBoard[currentRow][currentColumn]), end="")
currentColumn = currentColumn + 1
currentRow = currentRow + 1
print()
for currentColumn in range (0,numColumns,1):
print("*", end ="")
print(NEWLINE)
def start():
aBoard,numRows,numColumns = readBoardFromFile()
display(aBoard,numRows,numColumns)
start()
Normally when I run this code this is the output:
DISPLAY
5 5 6 7 8 7 8 9 0 1
7 6 3 4 1 1 3 6 8 1
8 7 4 1 3 1 9 8 5 1
*******************
How do I make it so that the output is:
DISPLAY
5 5 6 7 8
7 6 3 4 1
8 7 4 1 3
Only displaying the numbers in the left half?
Perhaps you can try using the csv module and open the file with tab delimiter.
Then assuming you go with the list approach you could only print the first element from each row.
Something like:
import csv
with open("a.txt") as my_file:
reader = csv.reader(my_file, delimiter ='\t')
next(reader) # to skip header if exists
for line in reader:
print(line[0])
From what I can see you aren't taking the tab character into account in your code which is probably why you have the additional characters in your output.
Here's the approach I would take, utilising Python's power when it comes to processing strings.
I would encourage you to use this type of approach when writing Python, as it will make your like much easier.
NEWLINE = "\n"
def read_board_from_file():
input_file_OK = False
a_board = []
while not input_file_OK:
try:
input_file_name = input("Enter the name of your file: ")
with open(input_file_name, "r") as input_file:
# A file-like object (as returned by open())
# can be simply iterated
for line in input_file:
# Skip header line if present
# (not sure how you would want to handle the
# header.
if "header" in line.lower():
continue
# Strip NEWLINE from each line,
# then split the line at the tab character.
# See comment above.
parts = line.strip(NEWLINE).split("\t")
# parts is a list, we are only interested
# in the first bit.
first_part = parts[0]
# Split the left part at all whitespaces.
# I'm assuming that this is what you want.
# A more complex treatment might make sense here,
# depending on your use-case.
entries = first_part.split()
# entries is a list, so we just need to append it
# to the board
a_board.append(entries)
input_file_OK = True
print(f"Completed reading of file {input_file_name}")
except IOError:
print("Error: File {input_file_name} couldn't be opened.")
return a_board
def display_board(a_board):
print("DISPLAY")
longest_row = 0
# a_board is a list of lists,
# no need to keep track of the number of rows and columns,
# we can just iterate it.
for row in a_board:
# row is a list of entries, we can use str.join() to add a space
# between the parts and format the row nicely.
row_str = " ".join(row)
# At the same time we can keep track of the longest row
# for printing the footer later.
len_row_str = len(row_str)
if len_row_str > longest_row:
longest_row = len_row_str
print(row_str)
# The footer is simply the asterisk character
# printed as many times as the longest row.
footer = "*" * longest_row
print(footer, end="")
print(NEWLINE)
def start():
a_board = read_board_from_file()
display_board(a_board)
start()
I'd do this with a bit more separation between inputs, outputs, and data processing. Your input is the name of the file, and a secondary input is the actual file contents. The data processing step is taking the file contents, and returning some internal representation of a collection of boards. The output is displaying the first board.
from typing import Iterable, List
def parse(lines: Iterable[str], board_sep: str = "\t") -> List[List[List[str]]]:
boards = []
for i, line in enumerate(lines):
# list of the same line from each board
board_lines = line.split(board_sep)
if i == 0:
# this only happens once at the start
# each board can be a list of lists. so boards is a list of "list of lists"
# we're going to append lines to each board, so we need some initial setup
# of boards
boards = [[] for _ in range(len(board_lines))]
for board_idx, board_line in enumerate(board_lines):
# then just add each line of each board to the corresponding section
boards[board_idx].append(board_line.split())
return boards
def show_board(board: List[List[str]]) -> None:
for row in board:
print(" ".join(row))
Now we can put that all together. We need to:
Get the filename
Open the file
Filter out the "Header" and any blank lines
Pass the rest of the lines to the parse() function
Get the first board
Print it with the context
from typing import Tuple
def get_board_dimensions(board: List[List[str]]) -> Tuple[int, int]:
""" Returns a tuple of (rows, cols) """
return len(board), len(board[0])
def get_filtered_file(filename: str) -> Iterable[str]:
with open(filename) as f:
for line in f:
if not line or line.lower() == "header":
continue
yield line
def main():
filename = input("Enter the name of your file: ")
filtered_lines = get_filtered_file(filename)
boards = parse(filtered_lines)
# now we can show the first one
b = boards[0]
_, cols = get_board_dimensions(b)
print("DISPLAY")
show_board(b)
print("*" * (2 * cols - 1)) # columns plus gaps
I would like to extract key and value from an existing text file. Key in a separate variable and value in a separate variable.
The text file (sample.txt) contains the below content,
one:two
three:four
five:six
seven:eight
nine:ten
sample:demo
I am able to read the content from the text file, but i am not able to proceed further to extract key and value.
with open ("sampletxt.txt", "r") as hfile:
sp = hfile.read()
print (sp)
x=0
for line in sp:
sp.split(":")[x].strip()
x+=1
The above only extracts the value and also provides index out of range exception at the end.
If we iterate through the file, i am expecting the output as below,
Key 0 = one
Key 1 = three
Key 2 = five
Key 3 = seven
key 4 = sample
Value 0 = two
Value 1 = four
Value 2 = six
Value 3 = eight
Value 4 = ten
This should work:
with open ("sampletxt.txt", "r") as hfile:
sp = hfile.read()
print (sp)
lines = sp.split("\n")
for line in lines:
# print("line:[{0}]".format(line))
parts = line.split(":")
print("key:[{0}], value:[{1}]".format(parts[0], parts[1]))
It can work:
sp = open ("sampletxt.txt", "r")
x=0
key=[]
value=[]
try:
while True:
text_line = sp.readline()
if text_line:
text_line = ''.join(text_line)
text_line = text_line.split()
text_line = ''.join(text_line).split(':')
key.append(text_line[0])
value.append(text_line[1])
x += 1
else:
for i in range(x):
print("Key {} = {}".format(i,key[i]))
print("")
for i in range(x):
print("Value {} = {}".format(i,value[i]))
break
finally:
sp.close()
The output is:
Key 0 = one
Key 1 = three
Key 2 = five
Key 3 = seven
Key 4 = nine
Key 5 = sample
Value 0 = two
Value 1 = four
Value 2 = six
Value 3 = eight
Value 4 = ten
Value 5 = demo
which is similar to your request
Why don't you try:
with open ("sampletxt.txt", "r") as hfile:
sp = hfile.read()
print (sp)
dictionary = {}
for x, line in enumerate(sp):
line_list = sp.split(":")
dictionary[line_list[0]]=line_list[1]
You should always check if split returns two members (or any number you expect) before using the indexes.
I am trying to scan a csv file and make adjustments line by line. In the end, I would like to remove the last line. How can I remove the last line within the same scanning loop?
My code below reads from the original file, makes adjustments and finally writes to a new file.
import csv
raw_data = csv.reader(open("original_data.csv", "r"), delimiter=",")
output_data = csv.writer(open("final_data.csv", "w"), delimiter=",")
lastline = # integer index of last line
for i, row in enumerate(raw_data):
if i == 10:
# some operations
output_data.writerow(row)
elif i > 10 and i < lastline:
# some operations
output_data.writerow(row)
elif i == lastline:
output_data.writerow([])
else:
continue
You can make a generator to yield all elements except the last one:
def remove_last_element(iterable):
iterator = iter(iterable)
try:
prev = next(iterator)
while True:
cur = next(iterator)
yield prev
prev = cur
except StopIteration:
return
Then you just wrap raw_data in it:
for i, row in enumerate(remove_last_element(raw_data)):
# your code
The last line will be ignored automatically.
This approach has the benefit of only reading the file once.
A variation of #Kolmar's idea:
def all_but_last(it):
buf = next(it)
for item in it:
yield buf
buf = item
for line in all_but_last(...):
Here's more generic code that extends islice (two-args version) for negative indexes:
import itertools, collections
def islice2(it, stop):
if stop >= 0:
for x in itertools.islice(it, stop):
yield x
else:
d = collections.deque(itertools.islice(it, -stop))
for item in it:
yield d.popleft()
d.append(item)
for x in islice2(xrange(20), -5):
print x,
# 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
You can iterate with window of size 2 and print only the first value in the window. This will lead to the last element being skipped:
from itertools import izip, tee
def pairwise(iterable):
a, b = itertools.tee(iterable)
next(b, None)
return izip(a, b)
for row, _ in pairwise(raw_data):
output_data.writerow(row)
output_data.writerow([])
An idea is to calculate the length of each line you iterate and then when coming to the last line truncate the file thus "shortening the file". Not sure if this is good practice though...
eg
Python: truncate a file to 100 lines or less
Instead of writing the current line each loop iteration, try writing the previously read line:
import csv
raw_data = csv.reader(open("original_data.csv", "r"), delimiter=",")
output_data = csv.writer(open("final_data.csv", "w"), delimiter=",")
last_iter = (None, None)
try:
last_iter = (0, raw_data.next())
except StopIteration:
# The file is empty
pass
else:
for new_row in raw_data:
i, row = last_iter
last_iter = (i + 1, new_row)
if i == 10:
# some operations
output_data.writerow(row)
elif i > 10:
# some operations
output_data.writerow(row)
# Here, the last row of the file is in the `last_iter` variable.
# It won't get written into the output file.
output_data.writerow([])
I have a code that works fine when I have small CSV's of data but errors out when I try to run large CSV's through it. In essence this code is supposed to place 3 CSV's worth of data into 3 separate dictionaries, combine those dictionaries into a master dictionary, and then preform arithmetic operations on dictionary. The input CSV's look something like this:
time A B C D
0 3 4 6 4
.001 4 6 7 8
.002 4 6 7 3
The code that I am using is the code displayed below. The error occurs within the lines 47 and 65 where I am try to preform arithmetic with the dictionary. Any explanation as to why this is going on is greatly appreciated.
import numpy
Xcoord = {}
time = []
with open ('Nodal_QuardnetsX2.csv', 'r') as f:
f.readline() # Skips first line
for line in f:
values = [s.strip()for s in line.split(',')]
Xcoord[values[0]] = map(float, values[1:])
time.append(values[0])
Ycoord = {}
with open ('Nodal_QuardnetsY2.csv', 'r') as f:
f.readline() # Skips first line
for line in f:
values = [s.strip()for s in line.split(',')]
Ycoord[values[0]] = map(float, values[1:])
Zcoord = {}
with open ('Nodal_QuardnetsZ2.csv', 'r') as f:
f.readline() # Skips first line
for line in f:
values = [s.strip()for s in line.split(',')]
Zcoord[values[0]] = map(float, values[1:])
# Create a master dictionary of the form {'key':[[x, y, z], [x, y, z]}
CoordCombo = {}
for key in Xcoord.keys():
CoordnateList = zip(Xcoord[key], Ycoord[key], Zcoord[key])
CoordCombo[key] = CoordnateList
counter = 0
keycount1 = 0
keycount2 = 0.001
difference = []
NodalDisplacements = {}
#Find the difference between the x, y, and z quardnets relative to that point in time
while keycount2 <= float(values[0]):
Sub = numpy.subtract(CoordCombo[str(keycount2)][counter], CoordCombo[str(keycount1)][counter])
counter = counter + 1
difference.append(Sub)
NodalDisplacements[keycount1] = Sub
keycount1 = keycount1 + 0.001
keycount2 = keycount2 + 0.001
counter = 0
keycount3 = 0
keycount4 = 0.001
Sum = []
breakpoint = float(values[0])-0.001
while keycount4 <= breakpoint:
Add = numpy.sum(NodalDisplacements[keycount4][counter], NodalDisplacements[keycount3][counter])
Sum.append(Add)
keycount3 = keycount3 + 0.001
keycount4 = keycount4 + 0.001
counter = counter + 1
if counter == 2:
counter = 0
print Sum
probably a line of your csv file does not contain 5 elements or the line is empty.
In your logic I would suggest to use
for line in f:
line = line.strip()
if not line: continue
if len(values) != N_COLS: continue # or error...
# other ...