how to write dynamic lines from pdf to excel - python

import pdfplumber
import openpyxl
pdf = pdfplumber.open("path")
page = pdf.pages [0]
text = page.extract_text()
lin = text.split("\n")
wb = openpyxl.load_workbook ("path")
ws= wb.active
ws.title = "S1"
u = int(0)
i = int(1)
for u in lin:
ws.cell(row = i, column =1).value = lin [u]
u = u+1
i = i+1
wb.save ("path")
pdf.close
print("Ok!")
Error:
TypeError: list indices must be integers or slices, not str
The error occurs on for.
I split each line of the pdf and now I want to write each line in excel.
Example:
line in specific pdf file:
A
B
C
D
I got every line value from the pdf. The variable "lin" for example, has the value A in line 0, the value B in line 1.
I want to take the value of lin 0 and write it in cell A1, and then in the same column take the value of lin 1 and write it in cell A2 in excel and so on.

You should use an integer variable u as the index.
u = 0
i = 1
for u in range(len(lin)):
ws.cell(row=i, column=1).value = lin[u]
u += 1
i += 1
You can also replace the for loop with a for-each loop:
for value in lin:
ws.cell(row=i, column=1).value = value
i += 1
In this case you don't need to worry about the index variable.
pdf.close()

Related

How to transform a csv file into a multi-dimensional list using Python?

I started out with a 4d list, something like
tokens = [[[["a"], ["b"], ["c"]], [["d"]]], [[["e"], ["f"], ["g"]],[["h"], ["i"], ["j"], ["k"], ["l"]]]]
So I converted this to a csv file using the code
import csv
def export_to_csv(tokens):
csv_list = [["A", "B", "C", word]]
for h_index, h in enumerate(tokens):
for i_index, i in enumerate(h):
for j_index, j in enumerate(i):
csv_list.append([h_index, i_index, j_index, j])
with open('TEST.csv', 'w') as f:
# using csv.writer method from CSV package
write = csv.writer(f)
write.writerows(csv_list)
But now I want to do the reverse process, want to convert a csv file obtained in this format, back to the list format mentioned above.
Assuming you wanted your csv file to look something like this (there were a couple typos in the posted code):
A,B,C,word
0,0,0,a
0,0,1,b
0,0,2,c
...
here's one solution:
import csv
def import_from_csv(filename):
retval = []
with open(filename) as fh:
reader = csv.reader(fh)
# discard header row
next(reader)
# process data rows
for (x,y,z,word) in reader:
x = int(x)
y = int(y)
z = int(z)
retval.extend([[[]]] * (x + 1 - len(retval)))
retval[x].extend([[]] * (y + 1 - len(retval[x])))
retval[x][y].extend([0] * (z + 1 - len(retval[x][y])))
retval[x][y][z] = [word]
return retval
def import_from_csv(file):
import ast
import csv
data = []
# Read the CSV file
with open(file) as fp:
reader = csv.reader(fp)
# Skip the first line, which contains the headers
next(reader)
for line in reader:
# Read the first 3 elements of the line
a, b, c = [int(i) for i in line[:3]]
# When we read it back, everything comes in as strings. Use
# `literal_eval` to convert it to a Python list
value = ast.literal_eval(line[3])
# Extend the list to accomodate the new element
data.append([[[]]]) if len(data) < a + 1 else None
data[a].append([[]]) if len(data[a]) < b + 1 else None
data[a][b].append([]) if len(data[a][b]) < c + 1 else None
data[a][b][c] = value
return data
# Test
assert import_from_csv("TEST.csv") == tokens
First, I'd make writing this construction in a CSV format independent from dimensions:
import csv
def deep_iter(seq):
for i, val in enumerate(seq):
if type(val) is list:
for others in deep_iter(val):
yield i, *others
else:
yield i, val
with open('TEST.csv', 'w') as f:
csv.writer(f).writerows(deep_iter(tokens))
Next, we can use the lexicographic order of the indices to recreate the structure. All we have to do is sequentially move deeper into the output list according to the indices of a word. We stop at the penultimate index to get the last list, because the last index is pointing only at the place of the word in this list and doesn't matter due to the natural ordering:
with open('TEST.csv', 'r') as f:
rows = [*csv.reader(f)]
res = []
for r in rows:
index = r[:-2] # skip the last index and word
e = res
while index:
i = int(index.pop(0)) # get next part of a current index
if i < len(e):
e = e[i]
else:
e.append([]) # add new record at this level
e = e[-1]
e.append(r[-1]) # append the word to the corresponding list

Copying same range multiple times from one workbook to another

Per the attached image, I am trying to copy and paste the same data into a different format.
I have figured out the first part of the code but I need help abbreviating the 2nd half after this comment:
"Fills in the concepts per store group step by step"
Currently, this code is not efficient and I would like to have it compressed into just a couple of lines.
Image of desired result (Right hand side):
Here is the code I have cobbled together so far:
import openpyxl as xl;
filename ="c:\\Users\kevin\Documents\Python Programs\Excel Python\Conceptlist.xlsx"
wb1 = xl.load_workbook(filename)
ws1 = wb1.worksheets[0]
# opening the destination excel file
filename1 ="c:\\Users\kevin\Documents\Python Programs\Excel Python\Conceptlist2.xlsx"
wb2 = xl.load_workbook(filename1)
ws2 = wb2.worksheets[0]
# copying the cell values from source
# excel file to destination excel file
rowctsq = ws1['A1']
j = 0
while j < rowctsq.value:
j = j + 3
for i in range (3 , 6):
# reading cell value from source excel file
# Populates the store list repeatedly
c = ws1.cell(row = i, column = 1)
ws2.cell(row =i , column = 1).value = c.value
ws2.cell(row =i + j , column = 1).value = c.value
# Fills in the concepts per store group step by step
c = ws1.cell(row = i, column = 2)
ws2.cell(row =i , column = 3).value = c.value
c = ws1.cell(row = i, column = 3)
ws2.cell(row =i + 3 , column = 3).value = c.value
c = ws1.cell(row = i, column = 4)
ws2.cell(row =i + 6 , column = 3).value = c.value
c = ws1.cell(row = i, column = 5)
ws2.cell(row =i + 9 , column = 3).value = c.value
# saving the destination excel file
wb2.save('c:\\Users\kevin\Documents\Python Programs\Excel Python\Conceptlist2.xlsx')
Hopefully, I get extra community points for answering my own question! I worked through this and have pretty much gotten to my destination. Here's the code I came up with. Works like a charm. :)
import openpyxl as xl;
filename ="c:\\Users\kevin\Documents\Python Programs\Excel Python\Conceptlist.xlsx"
wb1 = xl.load_workbook(filename)
ws1 = wb1.worksheets[0]
# opening the destination excel file
filename1 ="c:\\Users\kevin\Documents\Python Programs\Excel Python\Conceptlist2.xlsx"
wb2 = xl.load_workbook(filename1)
ws2 = wb2.worksheets[0]
# copying the cell values from source
# excel file to destination excel file
rowctsq = ws1['A1']
j = 0
k = 0
while j < rowctsq.value and k < 6:
j = j + 3
k = k + 1
for i in range (3 , 6):
# reading cell value from source excel file
# Populates store column
c = ws1.cell(row = i, column = 1)
ws2.cell(row =i + j , column = 1).value = c.value
# Populates concept 'x' column
c = ws1.cell(row = i, column = 1 + k)
ws2.cell(row =i + j , column = 3).value = c.value
# Populates concept name column
c = ws1.cell(row = 2, column = 1 + k)
ws2.cell(row =i + j , column = 2).value = c.value
# saving the destination excel file
wb2.save('c:\\Users\kevin\Documents\Python Programs\Excel Python\Conceptlist2.xlsx')

Populate different cells in a column with different values using a "for" loop

I'm trying to populate the first 9 cells in a first row with different values in an excel spreadhseet. The code as is populates the first 9 cells as expected BUT instead of populating each of the cells with "j" variable string values - "a","b","c","d","e" in each of the cells it populates all 9 cells with only last value - "e". How can I make the code to iterate through the string assigned in "j" and populate the cells in the spreadsheet with each of the string letters?
Python version 3.6,
IDE: Pycharm
Here is the code:
import xlsxwriter
workbook = xlsxwriter.Workbook("test.xlsx")
worksheet = workbook.add_worksheet()
for h in range(0, 9): #Cell position generator
u = 1
cell_position = (u + h)
g = "A"
f = str(cell_position)
iterated_cell_position = [g+f]#puts cell positions in a list
j = "abcde"
for p in iterated_cell_position:
for e in j:
worksheet.write(p, e)
workbook.close()
Please help me with this?
Thank you.
your iterated_cell_position is an array of one element, and the line
for e in j:
worksheet.write(p, e)
just writes each letter to the same cell. So you write a to the cell, then b to the cell, then c and so on. Try
import xlsxwriter
workbook = xlsxwriter.Workbook("test.xlsx")
worksheet = workbook.add_worksheet()
j = "abcde"
for h in range(0, 9): #Cell position generator
e = j[h % 5] # gets the correct letter in j (wraps around when h gets too large)
cell_position = "A{}".format(h + 1)
worksheet.write(cell_position, e)
workbook.close()

Increment column i element by values inside list

I have a loop that searches keywords in a text file and pastes integers that follow the keywords into an excel file. I want the integers to be in specific cells in the excel file. Is it possible to increment i by the values in i_list rather than always 5 like in the example?
i_list = [5,3,1,1]
def search(file, excel_file):
i = 2
found_keywords = list()
wb = load_workbook(excel_file)
sheets = wb.sheetnames
sheet1 = wb[sheets[0]]
for kwrd in keywords:
for line in file:
if kwrd in line and kwrd not in found_keywords:
found_keywords.append(kwrd)
sheet1.cell(row=3, column=i).value = int(re.search(r"\d+", line).group())
i += 5
elif kwrd in line:
continue
wb.save(excel_file)
If you dont need to cycle through the values of i then you can just create a generator to return you the values of i one by one. I have wrapped the calling of the next value of i in a try block since once you run out of values the code wouldnt know what to do. so we break the loop
i_list = (i for i in [5,3,1,1])
i = 2
for _ in range(10):
print(i)
try:
i += next(i_list)
except StopIteration as si:
print("no more values in i so loop terminating")
break
OUTPUT
2
7
10
11
12
no more values in i so loop terminating
However if you want to cycle thorugh the values of i you can use cycle from itertools module and infintly take the next item from i_list for as long as you need
from itertools import cycle
i_list = cycle([5,3,1,1])
i = 2
for _ in range(10):
print(i)
i += next(i_list)
OUTPUT
2
7
10
11
12
17
20
21
22
27
UPDATE OF YOUR CODE
below is an update of your code based on the fact you said you dont have to cycle. Remember that once you reach the end of i_list your code will not be able to increase i since there are no more values in i_list.
i_list = [5,3,1,1]
i_generator = (i for i in i_list)
def search(file, excel_file):
i = 2
found_keywords = list()
wb = load_workbook(excel_file)
sheets = wb.sheetnames
sheet1 = wb[sheets[0]]
for kwrd in keywords:
for line in file:
if kwrd in line and kwrd not in found_keywords:
found_keywords.append(kwrd)
sheet1.cell(row=3, column=i).value = int(re.search(r"\d+", line).group())
i += next(i_generator)
elif kwrd in line:
continue
wb.save(excel_file)
This code snippet, adjusted from the original, cycles through the values of i_list:
i_list = [5,3,1,1]
def search(file, excel_file):
i = 2
found_keywords = list()
wb = load_workbook(excel_file)
sheets = wb.sheetnames
sheet1 = wb[sheets[0]]
for kwrd in keywords:
for line in file:
if kwrd in line and kwrd not in found_keywords:
for i in i_list: # Update i based on the i-list value
sheet1.cell(row=3, column=i).value = int(re.search(r"\d+", line).group())
elif kwrd in line:
continue
wb.save(excel_file)

Comparing 2 text files

I have 2 text files I wants to compare column 3 and 4 t only if column 1 and 2 is the same in the files .
Text 1 :
12345,67890,4.6,5.7
89736,62828,5.1,4.2
63793,38392,5.4,7.3
Text 2:
12345,67890,4.6,5.7
63793,38392,5.4,7.3
My code :
pre = open ("g.txt","r")
post = open ("g2.utm","r")
line = pre.readlines()
if not line:
break
if line.startswith("L"):
print ("\n") #to avoid the header
else :
v = line[0:5]
l = line[6:11]
i = line[12:14]
k = line[15:17]
line2 = post.readlines()
if not line2:
break
if line2.startswith("L"):
print ("\n") #to avoid the header
else :
v2 = line[0:5]
l2 = line[6:11]
i2 = line[12:14]
k2 = line[15:17]
if v == v2 and l == l2 :
d = (i - i2)
h = (k - k2)
if d >= 6.25 and h >=6.25:
print (v2,l2,"not ok")
print ("Done")
Your code is too much repetitive and messy. Let me suggest you some modification in your code. First read the file line by line. How could you do that?
with open("g.txt","r") as f:
for line in f:
a_line_of_the_file = line
Next, instead of accessing the values with index, you can split them with commas and save them to a list.
valuelist = a_line_of_the_file.split(',')
# contains ["12345","67890","4.6","5.7"] at first iteration.
When you have two list from each row of two files, you can always compare them by index like:
if valuelist1[0]== valueList2[0]:
do_something
Cast the value first if you need another datatype.
You should now solve your problem yourselves. If you still got error, plz inform.

Categories