Word count of a row in excel file using python - python

I have an excel file with multiple columns. In one column I have different comments. I want to create a column just beside it to find the number of words in the comment columns using python code. Is there any possibility.

Try this:
import xlrd
import os
from string import punctuation, translate
from collections import Counter
filename = u'test.xlsx'
sheet_no = 1 # To get the first sheet of the workbook
path = 'C:\Users\myUsername\Directory for Excel files'
punctuation_map = dict((ord(c), u' ') for c in punctuation)
for filename in os.listdir(path):
if filename.endswith('.xlsx'):
print filename
workbook = xlrd.open_workbook(filename)
sheet = workbook.sheet_by_index(sheet_no)
values = []
for row in range(sheet.nrows):
for col in range(sheet.ncols):
c = sheet.cell(row, col)
if c.ctype == xlrd.XL_CELL_TEXT:
cv = unicode(c.value)
wordlist = cv.translate(punctuation_map).split()
values.extend(wordlist)
numberWords = Counter(wordlist)
print sum(numberWords.values()), ' words for that column'
count = Counter(values)
print sum(count.values()), ' total words counted (from all columns)'

import pandas as pd
df #is your dataframe
counter = [] #future column you want
for string in df.Comments.values: #for each string in your "Comments"
counter.append(string.count(' ') + 1) #num of spaces + 1
df['num_words'] = counter #add the column
df = df[['num_words', 'Comments']] #change the order of columns
my df was
my df
and I finally got this df

Related

How to use python to seperate a one column CSV file if the columns have no headings, then save this into a new excel file?

So, I am quite new to python and have been googling a lot but have not found a good solution. What I am looking to do is automate text to columns using python in an excel document without headers.
Here is the excel sheet I have
it is a CSV file where all the data is in one column without headers
ex. hi ho loe time jobs barber
jim joan hello
009 00487 08234 0240 2.0348 20.34829
delimeter is space and comma
What I want to come out is saved in another excel with the first two rows deleted and seperated into columns
( this can be done using text to column in excel but i would like to automate this for several excel sheets)
009 | 00487 | 08234 | 0240 | 2.0348 | 20.34829
the code i have written so far is like this:
import pandas as pd
import csv
path = 'C:/Users/ionan/OneDrive - Universiteit Utrecht/Desktop/UCU/test_excel'
os.chdir(path)
for root, dirs, files in os.walk(path):
for f in files:
df = pd.read_csv(f, delimiter='\t' + ';', engine = 'python')
Original file with name as data.xlsx:
This means all the data we need is under the column Data.
Code to split data into multiple columns for a single file:
import pandas as pd
import numpy as np
f = 'data.xlsx'
# -- Insert the following code in your `for f in files` loop --
file_data = pd.read_excel(f)
# Since number of values to be split is not known, set the value of `num_cols` to
# number of columns you expect in the modified excel file
num_cols = 20
# Create a dataframe with twenty columns
new_file = pd.DataFrame(columns = ["col_{}".format(i) for i in range(num_cols)])
# Change the column name of the first column in new_file to "Data"
new_file = new_file.rename(columns = {"col_0": file_data.columns[0]})
# Add the value of the first cell in the original file to the first cell of the
# new excel file
new_file.loc[0, new_file.columns[0]] = file_data.iloc[0, 0]
# Loop through all rows of original excel file
for index, row in file_data.iterrows():
# Skip the first row
if index == 0:
continue
# Split the row by `space`. This gives us a list of strings.
split_data = file_data.loc[index, "Data"].split(" ")
print(split_data)
# Convert each element to a float (a number) if we want numbers and not strings
# split_data = [float(i) for i in split_data]
# Make sure the size of the list matches to the number of columns in the `new_file`
# np.NaN represents no value.
split_data = [np.NaN] + split_data + [np.NaN] * (num_cols - len(split_data) - 1)
# Store the list at a given index using `.loc` method
new_file.loc[index] = split_data
# Drop all the columns where there is not a single number
new_file.dropna(axis=1, how='all', inplace=True)
# Get the original excel file name
new_file_name = f.split(".")[0]
# Save the new excel file at the same location where the original file is.
new_file.to_excel(new_file_name + "_modified.xlsx", index=False)
This creates a new excel file (with a single sheet) of name data_modified.xlsx:
Summary (code without comments):
import pandas as pd
import numpy as np
f = 'data.xlsx'
file_data = pd.read_excel(f)
num_cols = 20
new_file = pd.DataFrame(columns = ["col_{}".format(i) for i in range(num_cols)])
new_file = new_file.rename(columns = {"col_0": file_data.columns[0]})
new_file.loc[0, new_file.columns[0]] = file_data.iloc[0, 0]
for index, row in file_data.iterrows():
if index == 0:
continue
split_data = file_data.loc[index, "Data"].split(" ")
split_data = [np.NaN] + split_data + [np.NaN] * (num_cols - len(split_data) - 1)
new_file.loc[index] = split_data
new_file.dropna(axis=1, how='all', inplace=True)
new_file_name = f.split(".")[0]
new_file.to_excel(new_file_name + "_modified.xlsx", index=False)

split list values from openpyxl output in python

I am working with openpyxl in python to pull specific column data which has comma separated values in few cells. Output i get 'CVE-2021-1111, CVE-2021'. The output required is 'CVE-2021-1111', 'CVE-2021'.
Please help me
Input from excel column:(large data truncated here)
CVE-2017-12652
CVE-2020-12243
CVE-2019-14866
CVE-2019-16935
CVE-2019-17493
CVE-2021-1111, CVE-2021
from openpyxl import Workbook, load_workbook
senior = load_workbook("C:\senior.xlsx")
sr = senior.active
range1 = sr['B2':'B15']
csv1 = []
for names in range1:
for cell in names:
if cell.value != None:
csv1.append(cell.value)
print(csv1)
Output i get:
['CVE-2017-12652', 'CVE-2020-12243', 'CVE-2019-14866', 'CVE-2019-16935', 'CVE-2019-17493', 'CVE-2021-1111, CVE-2021']
Required output (last 2 values should be split)
['CVE-2017-12652', 'CVE-2020-12243', 'CVE-2019-14866', 'CVE-2019-16935', 'CVE-2019-17493', 'CVE-2021-1111', 'CVE-2021']
As you guessed, you will need to split the cell.value by the expected commas. Also you can remove the spaces in the strings before appending to csv1 to return the required output.
from openpyxl import Workbook, load_workbook
senior = load_workbook("C:\senior.xlsx")
sr = senior.active
range1 = sr['B2':'B15']
csv1 = []
for names in range1:
for cell in names:
if cell.value != None:
cell_value = cell.value.replace(' ', '').split(',')
csv1 += cell_value if isinstance(cell_value, list) else [cell_value]
EDIT: Based on the comment I added a simpler version, instead of filling csv1 in one line.
from openpyxl import Workbook, load_workbook
senior = load_workbook("C:\senior.xlsx")
sr = senior.active
range1 = sr['B2':'B15']
csv1 = []
for names in range1:
for cell in names:
if cell.value != None:
cell_value = cell.value.replace(' ', '').split(',')
if isinstance(cell_value, list):
csv1 += cell_value
else:
csv1.append(cell_value)

How to return a row from xlsx file based on items in a list

I have a list of phrases that I want to use to retrieve specific rows/cells form an xlsx file. The list values are always spelled exactly how the English column on the database is.
I need each sentence - in different language, to be put into their own list that can be outputted into a different excel file looking like this:
# importing openpyxl module
from openpyxl import load_workbook
import openpyxl
# Give the location of the file
path = "C:/Users/username/Desktop/ExcelTest.xlsx"
# To open the workbook
# workbook object is created
wb_obj = openpyxl.load_workbook(path)
# Get workbook active sheet object
# from the active attribute
sheet_obj = wb_obj.active
max_col = sheet_obj.max_column
m_row = sheet_obj.max_row
eng = []
fre = []
ger = []
spa = []
ita = []
list = ['Hello', 'I Love', 'Python']
for row in sheet_obj:
for a in list:
for cell in row:
if cell.value == a:
#print('Row:', cell.row , 'Column:', cell.column, 'Value:', cell.value)
for i in range(1, 5):
cell_obj = sheet_obj.cell(row = cell.row, column = i)
print(cell_obj.value, end= ' ')
eng.append(cell_obj.value)
break
print (eng)
Now the result i get from the code is partly correct - except the whole thing (every sentence from different languages is put into one list instead)
You can use pandas for this
import pandas as pd
path = "C:/Users/username/Desktop/ExcelTest.xlsx"
df = pd.read_excel(path)
languages = ['English', 'French', 'German', 'Spanish', 'Italian']
for language in languages:
print(df[language].values)

Using pandas Combining/merging 2 different Excel files/sheets

I am trying to combine 2 different Excel files. (thanks to the post Import multiple excel files into python pandas and concatenate them into one dataframe)
The one I work out so far is:
import os
import pandas as pd
df = pd.DataFrame()
for f in ['c:\\file1.xls', 'c:\\ file2.xls']:
data = pd.read_excel(f, 'Sheet1')
df = df.append(data)
df.to_excel("c:\\all.xls")
Here is how they look like.
However I want to:
Exclude the last rows of each file (i.e. row4 and row5 in File1.xls; row7 and row8 in File2.xls).
Add a column (or overwrite Column A) to indicate where the data from.
For example:
Is it possible? Thanks.
For num. 1, you can specify skip_footer as explained here; or, alternatively, do
data = data.iloc[:-2]
once your read the data.
For num. 2, you may do:
from os.path import basename
data.index = [basename(f)] * len(data)
Also, perhaps would be better to put all the data-frames in a list and then concat them at the end; something like:
df = []
for f in ['c:\\file1.xls', 'c:\\ file2.xls']:
data = pd.read_excel(f, 'Sheet1').iloc[:-2]
data.index = [os.path.basename(f)] * len(data)
df.append(data)
df = pd.concat(df)
import os
import os.path
import xlrd
import xlsxwriter
file_name = input("Decide the destination file name in DOUBLE QUOTES: ")
merged_file_name = file_name + ".xlsx"
dest_book = xlsxwriter.Workbook(merged_file_name)
dest_sheet_1 = dest_book.add_worksheet()
dest_row = 1
temp = 0
path = input("Enter the path in DOUBLE QUOTES: ")
for root,dirs,files in os.walk(path):
files = [ _ for _ in files if _.endswith('.xlsx') ]
for xlsfile in files:
print ("File in mentioned folder is: " + xlsfile)
temp_book = xlrd.open_workbook(os.path.join(root,xlsfile))
temp_sheet = temp_book.sheet_by_index(0)
if temp == 0:
for col_index in range(temp_sheet.ncols):
str = temp_sheet.cell_value(0, col_index)
dest_sheet_1.write(0, col_index, str)
temp = temp + 1
for row_index in range(1, temp_sheet.nrows):
for col_index in range(temp_sheet.ncols):
str = temp_sheet.cell_value(row_index, col_index)
dest_sheet_1.write(dest_row, col_index, str)
dest_row = dest_row + 1
dest_book.close()
book = xlrd.open_workbook(merged_file_name)
sheet = book.sheet_by_index(0)
print "number of rows in destination file are: ", sheet.nrows
print "number of columns in destination file are: ", sheet.ncols
Change
df.to_excel("c:\\all.xls")
to
df.to_excel("c:\\all.xls", index=False)
You may need to play around with the double quotes, but I think that will work.

Reading Excel File using Python, how do I get the values of a specific column with indicated column name?

I've an Excel File:
Arm_id DSPName DSPCode HubCode PinCode PPTL
1 JaVAS 01 AGR 282001 1,2
2 JaVAS 01 AGR 282002 3,4
3 JaVAS 01 AGR 282003 5,6
I want to save a string in the form Arm_id,DSPCode,Pincode. This format is configurable, i.e. it might change to DSPCode,Arm_id,Pincode. I save it in a list like:
FORMAT = ['Arm_id', 'DSPName', 'Pincode']
How do I read the content of a specific column with provided name, given that the FORMAT is configurable?
This is what I tried. Currently I'm able to read all the content in the file
from xlrd import open_workbook
wb = open_workbook('sample.xls')
for s in wb.sheets():
#print 'Sheet:',s.name
values = []
for row in range(s.nrows):
col_value = []
for col in range(s.ncols):
value = (s.cell(row,col).value)
try : value = str(int(value))
except : pass
col_value.append(value)
values.append(col_value)
print values
My output is :
[
[u'Arm_id', u'DSPName', u'DSPCode', u'HubCode', u'PinCode', u'PPTL'],
['1', u'JaVAS', '1', u'AGR', '282001', u'1,2'],
['2', u'JaVAS', '1', u'AGR', '282002', u'3,4'],
['3', u'JaVAS', '1', u'AGR', '282003', u'5,6']
]
Then I loop around values[0] trying to find out the FORMAT content in values[0] and then getting the index of Arm_id, DSPname and Pincode in the values[0] and then from next loop I know the index of all the FORMAT factors , thereby getting to know which value do I need to get .
But this is such a poor solution.
How do I get the values of a specific column with name in excel file?
A somewhat late answer, but with pandas, it is possible to get directly a column of an excel file:
import pandas
df = pandas.read_excel('sample.xls')
#print the column names
print df.columns
#get the values for a given column
values = df['Arm_id'].values
#get a data frame with selected columns
FORMAT = ['Arm_id', 'DSPName', 'Pincode']
df_selected = df[FORMAT]
Make sure you have installed xlrd and pandas:
pip install pandas xlrd
This is one approach:
from xlrd import open_workbook
class Arm(object):
def __init__(self, id, dsp_name, dsp_code, hub_code, pin_code, pptl):
self.id = id
self.dsp_name = dsp_name
self.dsp_code = dsp_code
self.hub_code = hub_code
self.pin_code = pin_code
self.pptl = pptl
def __str__(self):
return("Arm object:\n"
" Arm_id = {0}\n"
" DSPName = {1}\n"
" DSPCode = {2}\n"
" HubCode = {3}\n"
" PinCode = {4} \n"
" PPTL = {5}"
.format(self.id, self.dsp_name, self.dsp_code,
self.hub_code, self.pin_code, self.pptl))
wb = open_workbook('sample.xls')
for sheet in wb.sheets():
number_of_rows = sheet.nrows
number_of_columns = sheet.ncols
items = []
rows = []
for row in range(1, number_of_rows):
values = []
for col in range(number_of_columns):
value = (sheet.cell(row,col).value)
try:
value = str(int(value))
except ValueError:
pass
finally:
values.append(value)
item = Arm(*values)
items.append(item)
for item in items:
print item
print("Accessing one single value (eg. DSPName): {0}".format(item.dsp_name))
print
You don't have to use a custom class, you can simply take a dict(). If you use a class however, you can access all values via dot-notation, as you see above.
Here is the output of the script above:
Arm object:
Arm_id = 1
DSPName = JaVAS
DSPCode = 1
HubCode = AGR
PinCode = 282001
PPTL = 1
Accessing one single value (eg. DSPName): JaVAS
Arm object:
Arm_id = 2
DSPName = JaVAS
DSPCode = 1
HubCode = AGR
PinCode = 282002
PPTL = 3
Accessing one single value (eg. DSPName): JaVAS
Arm object:
Arm_id = 3
DSPName = JaVAS
DSPCode = 1
HubCode = AGR
PinCode = 282003
PPTL = 5
Accessing one single value (eg. DSPName): JaVAS
So the key parts are to grab the header ( col_names = s.row(0) ) and when iterating through the rows, to skip the first row which isn't needed for row in range(1, s.nrows) - done by using range from 1 onwards (not the implicit 0). You then use zip to step through the rows holding 'name' as the header of the column.
from xlrd import open_workbook
wb = open_workbook('Book2.xls')
values = []
for s in wb.sheets():
#print 'Sheet:',s.name
for row in range(1, s.nrows):
col_names = s.row(0)
col_value = []
for name, col in zip(col_names, range(s.ncols)):
value = (s.cell(row,col).value)
try : value = str(int(value))
except : pass
col_value.append((name.value, value))
values.append(col_value)
print values
By using pandas we can read excel easily.
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
DataF=pd.read_excel("Test.xlsx",sheet_name='Sheet1')
print("Column headings:")
print(DataF.columns)
Test at :https://repl.it
Reference: https://pythonspot.com/read-excel-with-pandas/
Here is the code to read an excel file and and print all the cells present in column 1 (except the first cell i.e the header):
import xlrd
file_location="C:\pythonprog\xxx.xlsv"
workbook=xlrd.open_workbook(file_location)
sheet=workbook.sheet_by_index(0)
print(sheet.cell_value(0,0))
for row in range(1,sheet.nrows):
print(sheet.cell_value(row,0))
The approach I took reads the header information from the first row to determine the indexes of the columns of interest.
You mentioned in the question that you also want the values output to a string. I dynamically build a format string for the output from the FORMAT column list. Rows are appended to the values string separated by a new line char.
The output column order is determined by the order of the column names in the FORMAT list.
In my code below the case of the column name in the FORMAT list is important. In the question above you've got 'Pincode' in your FORMAT list, but 'PinCode' in your excel. This wouldn't work below, it would need to be 'PinCode'.
from xlrd import open_workbook
wb = open_workbook('sample.xls')
FORMAT = ['Arm_id', 'DSPName', 'PinCode']
values = ""
for s in wb.sheets():
headerRow = s.row(0)
columnIndex = [x for y in FORMAT for x in range(len(headerRow)) if y == firstRow[x].value]
formatString = ("%s,"*len(columnIndex))[0:-1] + "\n"
for row in range(1,s.nrows):
currentRow = s.row(row)
currentRowValues = [currentRow[x].value for x in columnIndex]
values += formatString % tuple(currentRowValues)
print values
For the sample input you gave above this code outputs:
>>> 1.0,JaVAS,282001.0
2.0,JaVAS,282002.0
3.0,JaVAS,282003.0
And because I'm a python noob, props be to:
this answer,
this answer,
this question,
this question
and this answer.
I have read using openpyxl library,
import openpyxl
from pathlib import Path
xlsx_file = Path('C:\\Users\\Amit\\Desktop\\ReadExcel', 'ReadData.xlsx')
wb_obj = openpyxl.load_workbook(xlsx_file)
# Read the active sheet:
sheet = wb_obj.active
for i in range(sheet.max_column):
print(f'i = {i}')
for row in sheet.iter_rows():
print(row[i].value)
Although I almost always just use pandas for this, my current little tool is being packaged into an executable and including pandas is overkill. So I created a version of poida's solution that resulted in a list of named tuples. His code with this change would look like this:
from xlrd import open_workbook
from collections import namedtuple
from pprint import pprint
wb = open_workbook('sample.xls')
FORMAT = ['Arm_id', 'DSPName', 'PinCode']
OneRow = namedtuple('OneRow', ' '.join(FORMAT))
all_rows = []
for s in wb.sheets():
headerRow = s.row(0)
columnIndex = [x for y in FORMAT for x in range(len(headerRow)) if y == headerRow[x].value]
for row in range(1,s.nrows):
currentRow = s.row(row)
currentRowValues = [currentRow[x].value for x in columnIndex]
all_rows.append(OneRow(*currentRowValues))
pprint(all_rows)

Categories