Check for empty excel cell (Python) - python

I understand there are similar questions but I don't quite understand their answers. Any help is appreciated.
import xlrd
path = ('E:\clean.xlsx')
wb = xlrd.open_workbook(path)
sheet = wb.sheet_by_index(0)
is_empty = None
if (sheet.cell(row=0, column=0).value) == None:
print("empty")
How do I check for a specific cell in my excel file and check if it is empty? Thanks.

You can read a cell's value using sheet.cell_value(rowx=row, colx=col). However, the sheet behaves like an array, so the call raises an error if the requested row or column lies outside it. Therefore you'll need to check that the indexes are inside the array using sheet.nrows and sheet.ncols.
I'm sure there's a better way of checking for null or empty, but the following should work: if sheet.nrows > row and sheet.ncols > col and sheet.cell_value(rowx=row, colx=col):
Example:
import xlrd
path = ('E:\clean.xlsx')
wb = xlrd.open_workbook(path)
sheet = wb.sheet_by_index(0)
def is_null_or_empty(sheet, row, col):
    """Report whether the cell at (row, col) exists and holds a truthy value.

    NOTE: despite its name, this returns True when the cell HAS a value and
    False when it is empty or out of range -- which is how the prints below
    label the result ("has value for row ...").

    Args:
        sheet: object exposing ``nrows``, ``ncols`` and
            ``cell_value(rowx=..., colx=...)`` (an xlrd sheet in this example).
        row: 0-based row index.
        col: 0-based column index.

    Returns:
        bool: True only if both indexes are inside the sheet and the cell's
        value is truthy. Falsy values such as ``''`` or ``0`` yield False.
    """
    # Bounds are checked first so cell_value() is never called out of range.
    in_range = sheet.nrows > row and sheet.ncols > col
    return bool(in_range and sheet.cell_value(rowx=row, colx=col))
print(f'has value for row {0} col {0}: {is_null_or_empty(sheet, 0, 0)}')
print(f'has value for row {0} col {1}: {is_null_or_empty(sheet, 0, 1)}')
Output:
has value for row 0 col 0: False
has value for row 0 col 1: True

Related

copy value to certain column in different sheet using openpyxl

I am trying to copy the column values from worksheet1 to a specific column in worksheet2, while skipping all the null/None values in between. My code worked when I printed out all the values in the worksheet1 column, excluding all the null values. However, when I saved them to worksheet2, it only showed the last value, duplicated down the whole column (from row 2 to 20).
Don't know why only last value was written in the new column
from openpyxl import Workbook
from openpyxl import load_workbook
source_file = (r'XXX(Source file).xlsx')
dest_file = (r'XXX(dest file).xlsx')
wb1=load_workbook(source_file, data_only=True)
wb1.active=0
ws1=wb1.active
wb2=load_workbook(dest_file)
wb2.active=0
ws2=wb2.active
for a in range(9,43):
cell2 = ws1.cell(row = a, column = 10)
if cell2.value is None or cell2.value == 0:
continue
else:
print(cell2.value)
for b in range(2,20):
ws2.cell(row = b, column=4).value = cell2.value
wb2.save(dest_file)
Your second loop is nested so that it will always overwrite all values in the column of the second sheet with the same value from the first.
I'd do something like this:
# Next free row in column D of the destination sheet. It advances only when a
# value is actually copied, so the copied values end up contiguous.
idx = 2
# NOTE(review): max_row is inclusive, so this also reads row 43, whereas
# range(9, 43) in the question stops at row 42 -- confirm which is intended.
for row in ws1.iter_rows(min_row=9, max_row=43, min_col=10, max_col=10):
    cell = row[0]  # one-cell tuple because min_col == max_col
    if cell.value:  # skip None and 0, exactly as the question's loop does
        ws2.cell(row=idx, column=4, value=cell.value)
        idx += 1

Python: count number of empty cells in excel sheet of data

I'm relatively new to Python, and I'm attempting to count the number of empty cells in an excel sheet filled with data. To test the program, I've been deleting some values so that the cells are empty: my code is below
import xlrd
import pandas as pd
import openpyxl
df = pd.read_excel('5train.xls')
workbook = xlrd.open_workbook('5train.xls')
worksheet = workbook.sheet_by_name('5train')
#Task starts here
empty = 0
row_data = worksheet.nrows - 1
row = 0
cell = 0
while row < row_data:
if worksheet.cell(0, 0).value == xlrd.empty_cell.value:
empty += 1
cell += 1
else:
pass
row += 1
print("Number of empty cells in data sheet:", empty)
However, the code will consistently print "Number of empty cells in data sheet: 0" no matter how many cells I empty. Any pointers? Thank you!
You always check the same cell in your loop:
if worksheet.cell(0, 0).value == xlrd.empty_cell.value:
Only the cell in row 0 and columns 0 is checked if it is empty.
You can iterate over each row through the last row that contains data using .get_rows(), then count the empty cells by checking the value of each cell in each row.
workbook = xlrd.open_workbook('5train.xls')
worksheet = workbook.sheet_by_name('5train')
empty_cells = 0
for row in worksheet.get_rows():
    # Compare against xlrd's empty-cell value instead of relying on
    # truthiness, so cells holding 0 or False are not miscounted as empty.
    empty_cells += sum(1 for c in row if c.value == xlrd.empty_cell.value)
If you want to make it a one-liner, you can use:
# Count only cells whose value equals xlrd's empty-cell value; a plain
# truthiness test would also count cells holding 0 or False.
empty_cells = sum(1 for row in worksheet.get_rows() for c in row if c.value == xlrd.empty_cell.value)

Get excel cell address in Python 2 using openpyxl

I have the following scenario:
Read the excel file of column A for all active rows
Run if else statement: if column A == 'registered', do nothing and just read the next row. Else, get cell address (not value) and do some actions
For example, here is my "sample.xlsx":
[row 1] A1=registered B1=user1
[row 2] A2=registered B2=user2
[row 3] A3=null B3=user3
my code should ignore row1 and row2 and proceed with row 3. In row 3, I should get "A3" and do some actions.
from openpyxl import load_workbook
book = load_workbook(filename='sample.xlsx')
sheet = book.active
first_column = sheet['A']
for x in xrange(len(first_column)):
status = first_column[x].value
if status == 'registered':
#enter code to just proceed with the next row
else:
#enter code to get the column cell range and do some actions
book.save('sample.xlsx')
I got stuck on the second step of the scenario. I am a newbie in Python; I have working code in Java using FileScanner, but I need it in Python 2. I appreciate your help.
UPDATE:
I already have an answer below but just wondering if there are any other method??
Have a look
import openpyxl
wb = openpyxl.load_workbook('sample.xlsx')
ws = wb.active
for row in ws.iter_rows('A{}:A{}'.format(ws.min_row,ws.max_row)):
for cell in row:
if cell.value != "registered":
print cell.column
print cell.row
## further processing here #######
No need to write logic for explicit row iteration this for loop will take care of that.
Hope it will help you :)
so far, I have tried these. is there any other method?
from openpyxl import load_workbook
book = load_workbook(filename='sample.xlsx')
sheet = book.active
first_column = sheet['A']
checker = 'false'
for x in xrange(len(first_column)):
status = first_column[x].value
xaddress = first_column[x].column
yaddress = first_column[x].row
celladdress = xaddress+str(yaddress)
if status != 'registered':
print celladdress
checker == 'true'
break
if checker == 'true':
#... my actions here ....
Here is my "sample.xlsx":
[row 1] A1=registered B1=user1
[row 2] A2=registered B2=user2
[row 3] A3=null B3=user3
Output: A3
It always helps to read the documentation
for cell in ws['A']:
pass

Openpyxl: How to copy a row after checking if a cell contains specific value

I have a worksheet that is updated every week with thousands of rows and would need to transfer rows from this worksheet after filtering. I am using the current code to find the cells which has the value I need and then transfer the entire row to another sheet but after saving the file, I get the "IndexError: list index out of range" exception.
The code I use is as follows:
import openpyxl
wb1 = openpyxl.load_workbook('file1.xlsx')
wb2 = openpyxl.load_workbook('file2.xlsx')
ws1 = wb1.active
ws2 = wb2.active
for row in ws1.iter_rows():
for cell in row:
if cell.value == 'TrueValue':
n = 'A' + str(cell.row) + ':' + ('GH' + str(cell.row))
for row2 in ws1.iter_rows(n):
ws2.append(row2)
wb2.save("file2.xlsx")
The original code I used that used to work is below and has to be modified because of the large files which causes MS Excel not to open them (over 40mb).
n = 'A3' + ':' + ('GH'+ str(ws1.max_row))
for row in ws1.iter_rows(n):
ws2.append(row)
Thanks.
I'm not entirely sure what you're trying to do but I suspect the problem is that you have nested your copy loop.
Try the following:
row_nr = 1
for row in ws1:
for cell in row:
if cell.value == "TrueValue":
row_nr = cell.row
break
if row_nr > 1:
break
for row in ws1.iter_rows(min_row=row_nr, max_col=190):
ws2.append((cell.value for cell in row))
Question: I get the "IndexError: list index out of range" exception.
I get, from ws1.iter_rows(n)
UserWarning: Using a range string is deprecated. Use ws[range_string]
and from ws2.append(row2).
ValueError: Cells cannot be copied from other worksheets
The reason is that row2 holds a list of Cell objects instead of a list of values.
Question: ... need to transfer rows from this worksheet after filtering
The following do what you want, for instance:
# If you want to Start at Row 2 to append Row Data
# Set Private self._current_row to 1
ws2.cell(row=1, column=1).value = ws2.cell(row=1, column=1).value
# Define min/max Column Range to copy
from openpyxl.utils import range_boundaries
min_col, min_row, max_col, max_row = range_boundaries('A:GH')
# Define Cell Index (0 Based) used to Check Value
check = 0 # == A
for row in ws1.iter_rows():
if row[check].value == 'TrueValue':
# Copy Row Values
# We deal with Tuple Index 0 Based, so min_col must have to be -1
ws2.append((cell.value for cell in row[min_col-1:max_col]))
Tested with Python: 3.4.2 - openpyxl: 2.4.1 - LibreOffice: 4.3.3.2
Use a list to hold the items in each column for the particular row.
Then append the list to your ws2.
...
def iter_rows(ws, n):
    """Yield the cell values of each row in range ``n`` of ``ws`` as a list.

    Args:
        ws: worksheet (or any mapping) that returns a sequence of rows of
            cell objects when indexed with a range string.
        n: range string such as ``'A5:GH5'``.

    Yields:
        list: the ``.value`` of every cell in one row, in column order.
    """
    # ws[n] replaces the deprecated ws.iter_rows(range_string) call form.
    for row in ws[n]:
        yield [cell.value for cell in row]
for row in ws1.iter_rows():
for cell in row:
if cell.value == 'TrueValue':
n = 'A' + str(cell.row) + ':' + ('GH' + str(cell.row))
list_to_append = list(iter_rows(ws1,n))
for items in list_to_append:
ws2.append(items)
I was able to solve this with lists for my project.
import openpyxl
#load data file
wb1 = openpyxl.load_workbook('original.xlsx')
sheet1 = wb1.active
print("loaded 1st file")
#new template file
wb2 = openpyxl.load_workbook('blank.xlsx')
sheet2 = wb2.active
print("loaded 2nd file")
header = sheet1[1:1] #grab header row
listH =[]
for h in header:
listH.append(h.value)
sheet2.append(listH)
colOfInterest= 11 # this is my col that contains the value I'm checking against
for rowNum in range(2, sheet1.max_row +1): #iterate over each row, starting with 2 to skipping header from original file
if sheet1.cell(row=rowNum, column=colOfInterest).value is not None: #interested in non blank values in column 11
listA = [] # list which will hold my data
row = sheet1[rowNum:rowNum] #creates a tuple of row's data
#print (str(rowNum)) # for debugging to show what rows are copied
for cell in row: # for each cell in the row
listA.append(cell.value) # add each cell's data as an element in the list
if listA[10] == 1: # condition1 I'm checking for by looking up the index in the list
sheet2.append(listA) # appending the sheet2's next available row
elif listA[10] > 1: # condition2 I'm checking for by looking up the index in the list
# do something else and store it in bar
sheet2.append(bar) # appending the sheet2's next available row
print("saving file...")
wb2.save('result.xlsx') # save file
print("Done!")
Tested with: Python 3.7 openpyxl 2.5.4

How to use field name or column header in openpyxl?

See my code below. This code works very well, but I would like to do two things. One thing is I made if statement with or much shorter than actual for example. I have many columns like this, not all next to each other. I would like it to be shorter. Also, sometimes I may not know exact column letter.
So I want to know if there is a way to know the column name or header. Like the values that would be in very top row. So I can test to see if it is one of those values to always perform function on that cell if it's in the specified column.
I can't find openpyxl function to do column name. Not sure if it understands that first row is different than rest. I think maybe if not I can try to do test on first row, but don't understand how to make this.
So is there a way to call column name? or if there is no way to call column name to test, can someone help me with doing check on first row to see if it has value? then do change on correct row I'm in? Does this make sense.
So instead of code saying:
if cellObj.column == 'H' or ...
It would say:
if cellObj.column_header == 'NameOfField or ...
Or if not possible to do that, then:
if this cell has column where first row value is 'NameOfField' ...
Please help with best way to do this. I have looked on stackoverflow and in book and blog site, but does not seem to be a way to call column name (not the letter of column).
for row in sheet.iter_rows():
for cellObj in row:
if cellObj.column == 'H' or cellObj.column == 'I' or cellObj.column == 'L' or cellObj.column == 'M':
print(cellObj.value),
if cellObj.value.upper() == 'OldValue1':
cellObj.value = 1
print(cellObj.value)
elif cellObj.value.upper() == 'OldValue2':
cellObj.value = 2
print(cellObj.value)
EDIT
Assuming these are the header names you are looking for:
colnames = ['Header1', 'Header2', 'Header3']
Find the indices for these columns:
col_indices = {n for n, cell in enumerate(sheet.rows[0]) if cell.value in colnames}
Now iterate over the remain rows:
for row in sheet.rows[1:]:
for index, cell in enumerate(row):
if index in col_indices:
if cell.value.upper() == 'OldValue1':
cell.value = 1
print(cell.value)
elif cell.value.upper() == 'OldValue2':
cell.value = 2
print(cell.value)
Use a dictionary instead of a set to keep the column names around:
# Map each matching column's 0-based position in the header row to its
# header text, so the name is available while walking the data rows.
col_indices = {n: cell.value for n, cell in enumerate(sheet.rows[0])
               if cell.value in colnames}
for row in sheet.rows[1:]:
    for index, cell in enumerate(row):
        if index in col_indices:
            # Report the cell's own row number; ``index`` is the column
            # position and would mislabel the "row" field.
            print('col: {}, row: {}, content: {}'.format(
                col_indices[index], cell.row, cell.value))
            if cell.value.upper() == 'OldValue1':
                cell.value = 1
            elif cell.value.upper() == 'OldValue2':
                cell.value = 2
Old answer
This makes your if statement shorter:
if cellObj.column in 'HILM':
print(cellObj.value),
For multi letter column coordinates you need to use a list:
if cellObj.column in ['H', 'AA', 'AB', 'AD']:
print(cellObj.value),
You can use a dictionary object to store the key-value pairs for your data, where the key will be the header for each column, and the value will be the particular column value. You can then append these dictionary objects to a list and access them using a for loop and normal dictionary syntax.
For example:
Assuming "my_workbook" is an excel workbook with the following column headers and values stored in the first worksheet:
Name    Class  Age
John    1      12
Andrew  1      12
Jane    2      13
Load the workbook and get values only:
from openpyxl import load_workbook
wb = load_workbook('./my_workbook.xlsx')
ws = wb.worksheets[0].values
header = next(ws) #get the header row
my_data = []
Organise the data into a dictionary structure:
# Pair every remaining row with the header names, giving one dict per row.
for row in ws:
    my_data.append(dict(zip(header, row)))
You can then access the columns of each row using the headers as keys:
for data in my_data:
print(data['Name'], data['Class'], data['Age'])
This will output:
John 1 12
Andrew 1 12
Jane 2 13
As a final note, using a dictionary structure to store and access your data makes your code more readable, as opposed to using indices, and allows you to re-arrange the columns in the excel file without having to modify your code. Hope this helps. 😊
You can access cells from the first row and column using the sheet.cell(row=#, column=#) syntax. For example:
for row in sheet.iter_rows():
    # openpyxl columns are 1-based, so number the cells from 1; j can then be
    # passed straight to sheet.cell() to fetch the matching header cell.
    # NOTE(review): assumes iteration starts at column A -- confirm for sheets
    # whose data begins further right.
    for j, cellObj in enumerate(row, start=1):
        header_cell = sheet.cell(row=1, column=j)
        if cellObj.column in ['H', 'I', 'L', 'M', 'AA', 'AB']:
            print(cellObj.value),
            if cellObj.value.upper() == 'OldValue1':
                cellObj.value = 1
                print(cellObj.value)
            elif cellObj.value.upper() == 'OldValue2':
                cellObj.value = 2
                print(cellObj.value)
Since sheet.rows returns a generator, you can extract the headers from its first item, process them as needed, and then continue consuming the remaining rows. For instance:
headers = [cell.value for cell in next(sheet.rows)]
# find indexes of targeted columns
cols = [headers.index(header) for header in 'HILM']
conv = {'OldValue1': 1, 'OldValue2': 2}
for row in sheet.rows:
values = [cell.value for cell in row]
for col in cols:
values[col] = conv[values[col]]
There are many ways to do this. Here are some approaches that I have used:
1. Brute force
Assuming "sheet" and "workbook" are defined.
header = [cell for cell in sheet['A1:XFD1'][0] if cell.value is not None and cell.value.strip() != ''] #you get all non-null columns
target_values = ['NameOfField', 'NameOfField1', 'NameOfField2'] #filter list
target_header = [cell.column for cell in header if cell.value in target_values] #get column index
data = {'OldValue1': 1, 'OldValue2': 2}
for row in sheet.iter_rows(max_row=sheet.max_row, max_col=sheet.max_column):
for cell in row:
if cell.column in target_header and cell.value in data :
cell.value = data[cell.value]
In this case, the brute force is in "sheet['A1:XFD1']". we have to check for all columns the first time. But you'll get all cells references for columns. After that, we create target_values (our columns names...) and we create a list with column index (target_header). Finally we iterated over sheet. We check if the cell's column is in the column index and check if the cell's value is in data, so we're able to change the value.
Downside: if there are cells containing stray whitespace outside the "data area", max_row and max_column will include those cells, so you end up iterating over blank cells.
2. Check for boundaries
You can use your own max row and max column if the data has table form(no empty space between columns, a column with "id"-> not null, not whitespace).
from openpyxl.utils import get_column_letter
def find_limit_sheet(direction):
    """Return the 1-based index of the last non-blank cell along one axis.

    Args:
        direction: callable taking a 1-based index and returning an object
            with a ``.value`` attribute (here, a lambda wrapping sheet.cell).

    Returns:
        int: index of the last cell before the first blank one, scanning
        upward from 1. A cell is blank when its value is None or a
        whitespace-only string. Returns 1 when the very first cell is blank.
    """
    def _is_blank(value):
        # Only strings can be whitespace-only; numbers, dates, etc. are data
        # and must not be sent through .strip().
        return value is None or (isinstance(value, str) and value.strip() == '')

    max_limit_value = 1
    # Fetch each cell once per step rather than once per condition.
    while not _is_blank(direction(max_limit_value).value):
        max_limit_value += 1
    return (max_limit_value - 1) if max_limit_value != 1 else 1
max_qrow = find_limit_sheet(direction=lambda increment: sheet.cell(row=increment, column=1))
max_qcolumn = find_limit_sheet(direction=lambda increment: sheet.cell(column=increment, row=1))
header = [cell for cell in sheet[f'A1:{get_column_letter(max_qcolumn)}1']] #you get all non-null columns
target_values = ['NameOfField', 'NameOfField1', 'NameOfField2'] #filter list
target_header = [cell.column for cell in header[0] if cell.value in target_values] #get column names
data = {'OldValue1': 1, 'OldValue2': 2}
for row in sheet.iter_rows(max_row=max_qrow, max_col=max_qcolumn):
for cell in row:
if cell.column in target_header and cell.value in data :
cell.value = data[cell.value]
In this case we are inside "data area" only.
3. Optional: Using Pandas
If you need more complex operations on Excel data (I have to read lots of Excel files as a data source in my work :( ), I prefer to convert the sheet to a pandas DataFrame -> perform the operations -> save the result.
In this case we use all the data.
from openpyxl.utils import get_column_letter
import pandas as pd
def find_limit_sheet(direction):
    """Return the 1-based index of the last non-blank cell along one axis.

    Args:
        direction: callable taking a 1-based index and returning an object
            with a ``.value`` attribute (here, a lambda wrapping sheet.cell).

    Returns:
        int: index of the last cell before the first blank one, scanning
        upward from 1. A cell is blank when its value is None or a
        whitespace-only string. Returns 1 when the very first cell is blank.
    """
    def _is_blank(value):
        # Only strings can be whitespace-only; numbers, dates, etc. are data
        # and must not be sent through .strip().
        return value is None or (isinstance(value, str) and value.strip() == '')

    max_limit_value = 1
    # Fetch each cell once per step rather than once per condition.
    while not _is_blank(direction(max_limit_value).value):
        max_limit_value += 1
    return (max_limit_value - 1) if max_limit_value != 1 else 1
max_qrow = find_limit_sheet(direction=lambda increment: sheet.cell(row=increment, column=1))
max_qcolumn = find_limit_sheet(direction=lambda increment: sheet.cell(column=increment, row=1))
header = [cell.value for cell in sheet[f'A1:{get_column_letter(max_qcolumn)}1'][0]] #you get all non-null columns
# Build one dict per sheet row, keyed by the header values.
raw_data = []
for row in sheet.iter_rows(max_row=max_qrow, max_col=max_qcolumn):
    row_data = [cell.value for cell in row]
    raw_data.append(dict(zip(header, row_data)))
# The module is imported as ``pd`` above; the bare name ``pandas`` is never
# bound, so pandas.DataFrame would raise NameError.
df = pd.DataFrame(raw_data)
# Iteration starts at row 1, so the first record is the header row itself:
# promote it to the column labels and drop it from the data.
df.columns = df.iloc[0]
df = df[1:]
You can also use a subset of the columns by using target_header from example 2.
...
target_header = [cell.column for cell in header[0] if cell.value in target_values] #get column names
...
raw_data = []
for row in sheet.iter_rows(max_row=max_qrow, max_col=max_qcolumn):
row_data = [cell.value for cell in row if cell.column in target_header]
raw_data.append(dict(zip(header, row_data)))
df = pd.DataFrame(raw_data)
df.columns = df.iloc[0]
df = df[1:]
...
INFO
openpyxl: 2.6.2
pandas: 0.24.2
python: 3.7.3
Data Structures: List Comprehensions doc
lambda expr: lambda expression

Categories