How to slice Excel data using win32com in Python (3.6) - python

I have data in Excel that I want to split.
Here is my code
import win32com.client
excel = win32com.client.Dispatch("Excel.Application")
excel.Visible = True
wb = excel.Workbooks.Open('C:\\Users\\a\\Desktop\\bin256.xlsx')
ws = wb.ActiveSheet
input_number = ws.Range('U4:U15363')
number = [input_number[i:i+60] for i in range(0,len(input_number),60)]
But I get this error:
Traceback (most recent call last):
File "<pyshell#13>", line 1, in <module>
number = [input_number[i:i+60] for i in range(0,len(input_number),60)]
File "<pyshell#13>", line 1, in <listcomp>
number = [input_number[i:i+60] for i in range(0,len(input_number),60)]
File "C:\Users\a\AppData\Local\Programs\Python\Python36-32\lib\site-packages\win32com\client\dynamic.py", line 256, in __getitem__
return self._get_good_object_(self._oleobj_.Invoke(dispid, LCID, invkind, 1, index))
TypeError: Objects of type 'slice' can not be converted to a COM VARIANT
How can I fix that? Thanks in advance.

Did you just want to open excel?
Maybe you can try pandas
import pandas as pd
file_path=r'E:\download\tmp\test.xlsx'
file_path='./tmp/test.xlsx'
df_name.to_excel(file_path,sheet_name='Sheet1',index=False)

Related

citybikes: JSON to Dataframe

I am using python-citybikes (https://pypi.org/project/python-citybikes/) to retrieve some data.
However, I can't figure out a way export the data
import citybikes
import pandas as pd
client = citybikes.Client()
GlasgowNextBike = citybikes.Network(client, uid='nextbike-glasgow')
list(GlasgowNextBike.stations)
Stations = list(GlasgowNextBike.stations)
pd.read_json(Stations)
I am getting
Traceback (most recent call last):
File "<ipython-input-15-5a1904def0e8>", line 1, in <module>
pd.read_json(Stations)
File "/Users/noor/opt/anaconda3/lib/python3.7/site-packages/pandas/util/_decorators.py", line 214, in wrapper
return func(*args, **kwargs)
File "/Users/noor/opt/anaconda3/lib/python3.7/site-packages/pandas/io/json/_json.py", line 585, in read_json
path_or_buf, encoding=encoding, compression=compression
File "/Users/noor/opt/anaconda3/lib/python3.7/site-packages/pandas/io/common.py", line 200, in get_filepath_or_buffer
raise ValueError(msg)
ValueError: Invalid file path or buffer object type: <class 'list'>
My question is :
How can I export/save the results as JSON or CSV file
Try using the json module, like so:
import citybikes, json
client = citybikes.Client()
GlasgowNextBike = citybikes.Network(client, uid='nextbike-glasgow')
with open('GlasgowNextBike.json', 'w') as f:
json.dump(GlasgowNextBike.data, f, indent=2)

error in creating series in pandas

I am getting an error when creating a series in pandas.
Whenever I try to print the series I have created, I get an error.
The code I am running:
import pandas as pd
data2 = [1,2,3,4]
index = ['a','b','c','d']
s = pd.Series(data2, index)
print(s.shape)
s
The error:
Traceback (most recent call last):
File "<pyshell#6>", line 1, in <module>
s
File "C:\Python34\lib\idlelib\rpc.py", line 611, in displayhook
text = repr(value)
File "C:\Python34\lib\site-packages\pandas\core\base.py", line 80, in __repr__
return str(self)
File "C:\Python34\lib\site-packages\pandas\core\base.py", line 59, in __str__
return self.__unicode__()
File "C:\Python34\lib\site-packages\pandas\core\series.py", line 1060, in __unicode__
width, height = get_terminal_size()
File "C:\Python34\lib\site-packages\pandas\io\formats\terminal.py", line 33, in get_terminal_size
return shutil.get_terminal_size()
File "C:\Python34\lib\shutil.py", line 1071, in get_terminal_size
size = os.get_terminal_size(sys.__stdout__.fileno())
AttributeError: 'NoneType' object has no attribute 'fileno'
Your error is related to pyshell, not to pandas.
Try to run it through python directly or jupyter console, because the code you provided is correct.

Python gives KeyError while the key is passed

I am trying to create different python file where the code is given below. While calling the method, I pass the mydata as data frame with these columns
['wage', 'educ', 'exper', 'tenure'].
import pandas as pd
import numpy as np
from prettytable import PrettyTable as pt
def LinearRegressionOLS(mydata,target_column):
if(not isinstance(mydata,pd.DataFrame)):
raise TypeError("Data must be of type Data Frame")
if(not isinstance(target_column,str)):
raise TypeError("target_column must be String")
if(target_column not in mydata.columns):
raise KeyError("target_column doesn't exist in Data Frame")
data=mydata.copy()
data["one"]=np.ones(data.count()[target_column])
column_list=["one"]
for i in data.columns:
column_list.append(i)
Y=data[target_column].as_matrix()
data.drop(target_column,inplace=True,axis=1)
X=data[column_list].as_matrix()
del data
beta = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)),X.T),Y)
predY = np.matmul(X,beta)
total = np.matmul((Y-np.mean(Y)).T,(Y-np.mean(Y)))
residual = np.matmul((Y-predY).T,(Y-predY))
sigma = np.matmul((Y-predY).T,(Y-predY))/(X.shape[0]-X.shape[1])
omega = np.square(sigma)*np.linalg.inv(np.matmul(X.T,X))
SE = np.sqrt(np.diag(omega))
tstat = beta/SE
Rsq = 1-(residual/total)
final = pt()
final.add_column(" ",column_list)
final.add_column("Coefficients",beta)
final.add_column("Standard Error",SE)
final.add_column("t-stat",tstat)
print(final)
print("Residual: ",residual)
print("Total: ",total)
print("Standard Error: ",sigma)
print("R Square: ",Rsq)
After running the above code, by calling the function given below,
>>> c
['wage', 'educ', 'exper', 'tenure']
>>> import LR_OLS as inf
>>> inf.LinearRegressionOLS(file[c],"wage")
, i get some error like this
Traceback (most recent call last):
File "<pyshell#182>", line 1, in <module>
inf.LinearRegressionOLS(file[c],"wage")
File "E:\python\LR_OLS.py", line 29, in LinearRegressionOLS
File "C:\Program Files\Python35\lib\site-packages\pandas\core\frame.py", line 2133, in __getitem__
return self._getitem_array(key)
File "C:\Program Files\Python35\lib\site-packages\pandas\core\frame.py", line 2177, in _getitem_array
indexer = self.loc._convert_to_indexer(key, axis=1)
File "C:\Program Files\Python35\lib\site-packages\pandas\core\indexing.py", line 1269, in _convert_to_indexer
.format(mask=objarr[mask]))
KeyError: "['wage'] not in index"
Can anyone help me as to why i am getting this error. How can i resolve it?
The problem is that you still have 'wage' in 'column_list. So in order to never let it get in there do the following adaptation:
for i in data.columns:
if i != 'wage': # add this line to your code
column_list.append(i)

error opening xlsx files in python

I am trying to open an xlsx file that is created by another system (and this is the format in which the data always comes, and is not in my control). I tried both openpyxl (v2.3.2) and xlrd (v1.0.0) (as well as pandas (v0.20.1) read_excel and pd.ExcelFile(), both of which are using xlrd, and so may be moot), and I am running into errors; plus not finding answers from my searches. Any help is appreciated.
xlrd code:
import xlrd
workbook = xlrd.open_workbook(r'C:/Temp/Data.xlsx')
Error:
Traceback (most recent call last):
File "<ipython-input-3-9e5d87f720d0>", line 2, in <module>
workbook = xlrd.open_workbook(r'C:/Temp/Data.xlsx')
File "C:\Program Files\Anaconda3\lib\site-packages\xlrd\__init__.py", line 422, in open_workbook
ragged_rows=ragged_rows,
File "C:\Program Files\Anaconda3\lib\site-packages\xlrd\xlsx.py", line 833, in open_workbook_2007_xml
x12sheet.process_stream(zflo, heading)
File "C:\Program Files\Anaconda3\lib\site-packages\xlrd\xlsx.py", line 548, in own_process_stream
self_do_row(elem)
File "C:\Program Files\Anaconda3\lib\site-packages\xlrd\xlsx.py", line 685, in do_row
self.sheet.put_cell(rowx, colx, None, float(tvalue), xf_index)
ValueError: could not convert string to float:
openpyxl code:
import openpyxl
wb = openpyxl.load_workbook(r'C:/Temp/Data.xlsx')
Error:
Traceback (most recent call last):
File "<ipython-input-2-6083ad2bc875>", line 1, in <module>
wb = openpyxl.load_workbook(r'C:/Temp/Data.xlsx')
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\reader\excel.py", line 234, in load_workbook
parser.parse()
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\reader\worksheet.py", line 106, in parse
dispatcher[tag_name](element)
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\reader\worksheet.py", line 243, in parse_row_dimensions
self.parse_cell(cell)
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\reader\worksheet.py", line 188, in parse_cell
value = _cast_number(value)
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\cell\read_only.py", line 23, in _cast_number
return long(value)
ValueError: invalid literal for int() with base 10: ' '
pandas code:
import pandas as pd
df = pd.read_excel(r'C:/Temp/Data.xlsx', sheetname='Sheet1')
Error:
Traceback (most recent call last):
File "<ipython-input-5-b86ec98a4e9e>", line 2, in <module>
df = pd.read_excel(r'C:/Temp/Data.xlsx', sheetname='Sheet1')
File "C:\Program Files\Anaconda3\lib\site-packages\pandas\io\excel.py", line 200, in read_excel
io = ExcelFile(io, engine=engine)
File "C:\Program Files\Anaconda3\lib\site-packages\pandas\io\excel.py", line 257, in __init__
self.book = xlrd.open_workbook(io)
File "C:\Program Files\Anaconda3\lib\site-packages\xlrd\__init__.py", line 422, in open_workbook
ragged_rows=ragged_rows,
File "C:\Program Files\Anaconda3\lib\site-packages\xlrd\xlsx.py", line 833, in open_workbook_2007_xml
x12sheet.process_stream(zflo, heading)
File "C:\Program Files\Anaconda3\lib\site-packages\xlrd\xlsx.py", line 548, in own_process_stream
self_do_row(elem)
File "C:\Program Files\Anaconda3\lib\site-packages\xlrd\xlsx.py", line 685, in do_row
self.sheet.put_cell(rowx, colx, None, float(tvalue), xf_index)
ValueError: could not convert string to float:
For what its worth, here is an example snippet of the input file:
I am guessing that the errors are coming from the first row having blanks beyond the first column - because the errors vanish when I delete the first two rows and . I cannot skip the first two rows, because I want to extract the value in cell A1. I would also like to force the values read to be string type, and will later convert to float with error checking. thanks!
===========
Update(Aug 9 10AM EDT): Using Charlie's suggestion, was able to open excel file in read only mode; and was able to read most of the contents - but still running into an error somewhere.
new code (sorry it is not very pythonic - still a newbie):
wb = openpyxl.load_workbook(r'C:/Temp/Data.xlsx', read_only=True)
ws = wb['Sheet1']
ws.max_row = ws.max_column = None
i=1
for row in ws.rows:
for cell in row:
if i<2000:
i += 1
try:
print(i, cell.value)
except:
print("error")
Error:
Traceback (most recent call last):
File "<ipython-input-65-2e8f3cf2294a>", line 2, in <module>
for row in ws.rows:
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\worksheet\read_only.py", line 125, in get_squared_range
yield tuple(self._get_row(element, min_col, max_col))
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\worksheet\read_only.py", line 165, in _get_row
value, data_type, style_id)
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\cell\read_only.py", line 36, in __init__
self.value = value
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\cell\read_only.py", line 132, in value
value = _cast_number(value)
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\cell\read_only.py", line 23, in _cast_number
return long(value)
ValueError: invalid literal for int() with base 10: ' '
=========
Update2 (10:35AM): when i read the file without ws.max_row and ws.max_column set as None, the code was reading just one column, without errors. The value in cell A66 is "Generated from:". But when i read the file with ws.max_row and ws.max_column set as None, this particular cell is causing trouble. But I can read all other cells before that, and that will work fine for me, right now. thanks, #Charlie.
Sounds like the source file is probably corrupt and contains cells that with empty strings that are typed as numbers. You might be able to use openpyxl's read-only mode to skip the first tow rows.
If your program works after you delete the first two rows then lets skip them. try use skiprows to ignore the first 2 rows that are blanks or are headers. you can use the parse method from panda.
xls = pd.read_excel('C:/Temp/Data.xlsx')
df = xls.parse('Sheet1', skiprows=2) #assuming your data is on sheet1.

K means clustering using weka python

from weka.clusterers import Clusterer
import weka.core.converters as converters
data = converters.load_any_file("/home/ubuntu/test.csv")
data.class_is_last()
clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
clusterer.build_clusterer(data)
print(clusterer)
# cluster the data
for inst in data:
cl = clusterer.cluster_instance(inst) # 0-based cluster index
dist = clusterer.distribution_for_instance(inst) # cluster membership distribution
print("cluster=" + str(cl) + ", distribution=" + str(dist))
I used the above code for doing k means custering i am not able to execute the program
The following are the errors I get
Traceback (most recent call last):
File "clus.py", line 6, in <module>
data = converters.load_any_file("/home/ubuntu/hello.csv")
File "/usr/local/lib/python2.7/dist-packages/weka/core/converters.py", line 255, in load_any_file
loader = loader_for_file(filename)
File "/usr/local/lib/python2.7/dist-packages/weka/core/converters.py", line 239, in loader_for_file
"(Ljava/lang/String;)Lweka/core/converters/AbstractFileLoader;", filename)
File "/usr/local/lib/python2.7/dist-packages/javabridge/jutil.py", line 932, in static_call
fn = make_static_call(class_name, method_name, sig)
File "/usr/local/lib/python2.7/dist-packages/javabridge/jutil.py", line 903, in make_static_call
klass = env.find_class(class_name)
AttributeError: 'NoneType' object has no attribute 'find_class'
I don't know why I am getting these errors. Can someone help me with this?
As described in the python-weka-wrapper API you have to import and start the Java Virtual Machine:
import weka.core.jvm as jvm
jvm.start()
It should solve your problem.

Categories