read Chinese character from excel file python3 - python

I have an Excel file that contains two columns: the first one in Chinese and the second is just a link.
I tried two methods I found here, but they didn't work and I can't print the values in the console. I changed my encoding setting in PyCharm to UTF-8, and it still doesn't work.
I used the pandas and xlrd libraries; neither worked for me, although they worked for others who posted.
This is my current code:
from xlrd import open_workbook

class Arm(object):
    def __init__(self, id, dsp_name):
        self.id = id
        self.dsp_name = dsp_name

    def __str__(self):
        return("Arm object:\n"
               "  Arm_id = {0}\n"
               "  DSPName = {1}\n"
               .format(self.id, self.dsp_name))

if __name__ == '__main__':
    wb = open_workbook('test.xls')
    for sheet in wb.sheets():
        print(sheet)
        number_of_rows = sheet.nrows
        number_of_columns = sheet.ncols
        items = []
        rows = []
        for row in range(1, number_of_rows):
            values = []
            for col in range(number_of_columns):
                value = str(sheet.cell(row, col).value)
                for a in value:
                    print('\n'.join([a]))
                values.append(value)
                print(value)
        for item in items:
            print(item)
            print("Accessing one single value (eg. DSPName): {0}".format(item.dsp_name))
            print
Obviously it's not working; I was just messing around with it after giving up.
File : http://www59.zippyshare.com/v/UxITFjis/file.html

It's not about encoding; you're not accessing the right rows.
In this line:
for row in range(1, number_of_rows):
why do you want to start at 1 instead of 0? range(1, n) yields 1 through n-1, skipping row 0.
Try: for row in range(number_of_rows):
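For reference, a minimal sketch that walks every row, header included (xlrd's row_values returns a whole row at once):

import xlrd

wb = xlrd.open_workbook('test.xls')
sheet = wb.sheet_by_index(0)
for row in range(sheet.nrows):  # 0 .. nrows-1, header row included
    print(sheet.row_values(row))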

Well, the problem I had wasn't actually in reading the Chinese characters! My problem was in printing to the console.
I thought the print encoding worked fine and that I just wasn't reading the characters, but this code works fine:
from xlrd import open_workbook

wb = open_workbook('test.xls')
messages = []
links = []
for sheet in wb.sheets():
    number_of_rows = sheet.nrows
    number_of_columns = sheet.ncols
    for row in range(1, number_of_rows):
        i = 0
        for col in range(number_of_columns):
            value = (sheet.cell(row, col).value).encode('gbk')
            if i == 0:
                messages.append(value)
            else:
                links.append(value)
            i += 1
print(links)
To check it, I pasted the first result into the Selenium driver (since I was going to use it anyway):
element = driver.find_element_by_class_name('email').send_keys(str(messages[0], 'gbk'))
and it works like a charm!
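For what it's worth, Python 3 strings are already Unicode and xlrd hands cell values back as str, so the encode/decode round trip is usually unnecessary. A hedged pandas sketch (the column names 'message' and 'link' are just assumed labels, and the console must be able to display Chinese, e.g. with PYTHONIOENCODING=utf-8):

import pandas as pd

# header=0 skips the first row, like range(1, number_of_rows) above
df = pd.read_excel('test.xls', header=0, names=['message', 'link'])
messages = df['message'].tolist()  # Chinese text as ordinary str
links = df['link'].tolist()
print(messages[0])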

Related

How to loop through rows of the Excel sheet using openpyxl?

I am using Python, Selenium, and openpyxl in order to fill a form online.
To fill the form I am taking values from specific cells in an Excel file (.xlsx).
(To test the code you can just create an Excel file with 2 columns; insert some names under column A and some ages under column B.)
From cell A2, I take the NAME of the person and insert it into the online form.
From cell B2, I take the LASTNAME of the person and insert it into the online form.
Then I click 'Reset' (this is an example, but in the real code I will click 'Save as a draft').
I would like to create a loop in which the code will start again from driver.get("https://www.roboform.com/filling-test-all-fields") to go again to the page where I need to fill out the form, but this time I would like to take:
From cell A3, the NAME of the person, and insert it into the online form.
From cell B3, the LASTNAME of the person, and insert it into the online form.
And click 'Save as a draft' again.
Then again, another loop to insert the data from row 4, so the program should run again from driver.get("https://www.roboform.com/filling-test-all-fields"), but this time take the values from A4 and B4, and so on, until the row in the Excel file is empty.
With the following code I can insert the data into the online form:
from selenium import webdriver
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.common.exceptions import NoSuchElementException
import openpyxl

driver: WebDriver = webdriver.Chrome("/Users/HHHHH/PycharmProjects/excel/driver/chromedriver")
driver.maximize_window()

excel_document = openpyxl.load_workbook(r"/Users/XPATH OF THE EXCEL FILE YOU CREATE TO TEST THIS CODE",
                                        data_only=True)
sheet = excel_document["Sheet1"]

driver.get("https://www.roboform.com/filling-test-all-fields")

# Insert in the form the Name of the person
prevsymbol = sheet["A2"].value
if prevsymbol == None:
    pass
else:
    try:
        driver.find_element_by_name("02frstname").send_keys(sheet["A2"].value)
    except NoSuchElementException:
        print("A2:(name) Not Found")

# Insert in the form the Last Name of the person
prevsymbol = sheet["B2"].value
if prevsymbol == None:
    pass
else:
    try:
        driver.find_element_by_name("04lastname").send_keys(sheet["B2"].value)
    except NoSuchElementException:
        print("B2:(Lastname) Not Found")

# Click Save as a draft
driver.find_element_by_xpath("//*[@value='Reset']").click()
I have created a helper class; please check whether it fulfills your purpose. This code was written against an old version of openpyxl, so update it if needed.
import re

import xlrd
from openpyxl import load_workbook
from openpyxl import Workbook as openpyxlWorkbook


class OpenpyxlImport(object):
    def __init__(self, file):
        self.file = file
        if self.file.name.endswith('.xls'):
            self.wb = self.xls_to_xlsx(self.file)
        else:
            self.wb = load_workbook(self.file)
        self.sheets = self.wb.worksheets

    def to_camelcase(self, string):
        text = re.sub(r'(?!^)_([a-zA-Z])', lambda m: ' ' + m.group(1).upper(), str(string))
        return text.upper()

    def to_snake_case(self, string):
        text = re.sub(r'\s', '_', str(string))
        return text.lower()

    def xls_to_xlsx(self, content):
        xls_book = xlrd.open_workbook(file_contents=content.read())
        workbook = openpyxlWorkbook()
        for i in range(0, xls_book.nsheets):
            xls_sheet = xls_book.sheet_by_index(i)
            sheet = workbook.active if i == 0 else workbook.create_sheet()
            sheet.title = xls_sheet.name
            for row in range(0, xls_sheet.nrows):
                for col in range(0, xls_sheet.ncols):
                    sheet.cell(row=row + 1, column=col + 1).value = xls_sheet.cell_value(row, col)
        return workbook

    def tally_header(self, row, fields):
        # Strip whitespace in cell value
        for cell in row:
            cell.value = cell.value.rstrip()
        return [cell.value for cell in row] == fields

    def row_to_dict(self, row):
        dct = {}
        for cell in row:
            dct[self.to_snake_case(self.get_first_sheet()[cell.column + '1'].value)] = cell.value
        return dct

    def get_sheets(self):
        return self.sheets

    def get_first_sheet(self):
        return self.sheets[0]

    def get_sheet_rows(self):
        return tuple(self.get_first_sheet().iter_rows())

# Usage
excel = OpenpyxlImport(file)
rows = excel.get_sheet_rows()
if excel.tally_header(rows[0], self.fields):
    for row in rows[1:]:
        params = excel.row_to_dict(row)
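A hedged usage sketch outside the snippet above (the class only needs an object with .name and .read(), so a plain file handle works; 'legacy.xls' is a placeholder filename):

with open('legacy.xls', 'rb') as f:
    excel = OpenpyxlImport(f)  # the .xls branch converts via xlrd
    for row in excel.get_sheet_rows():
        print([cell.value for cell in row])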
You can get the number of rows in the sheet using the max_row property. So, the code becomes:
from selenium import webdriver
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.common.exceptions import NoSuchElementException
import openpyxl

driver: WebDriver = webdriver.Chrome("/Users/HHHHH/PycharmProjects/excel/driver/chromedriver")
driver.maximize_window()

excel_document = openpyxl.load_workbook(r"/Users/HHHHH/Desktop/testtesttest1.xlsx",
                                        data_only=True)
sheet = excel_document["Sheet1"]

for i in range(1, sheet.max_row + 1):
    driver.get("https://XXXXXXXXXX")

    # Insert in the form the Name of the person
    cell = "A" + str(i)
    prevsymbol = sheet[cell].value
    # Note that instead of doing the work in an else clause, you can negate the test
    if prevsymbol is not None:
        try:
            # Note that we can use prevsymbol here, instead of referring to the cell once again
            driver.find_element_by_id("name").send_keys(prevsymbol)
        except NoSuchElementException:
            print(cell + ":(name) Not Found")

    # Insert in the form the Age of the person
    cell = "B" + str(i)
    prevsymbol = sheet[cell].value
    if prevsymbol is not None:
        try:
            driver.find_element_by_id("age").send_keys(prevsymbol)
        except NoSuchElementException:
            print(cell + ":(Age) Not Found")

    # Click Save as a draft
    driver.find_element_by_xpath("xpath_save_as_draft").click()
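If you would rather not build cell coordinates by string concatenation, openpyxl's iter_rows can walk the same data; a sketch assuming the two-column name/age layout above:

for name_cell, age_cell in sheet.iter_rows(min_row=1, max_col=2):
    if name_cell.value is None:
        continue
    # name_cell.coordinate is e.g. 'A2'; values come straight from the cells
    print(name_cell.coordinate, name_cell.value, age_cell.value)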

AttributeError: 'pywintypes.datetime' object has no attribute 'nanosecond'

I have some code to open an Excel file and save it as a pandas DataFrame. It was originally used in Python 2.7 and I am currently trying to make it work under Python 3.
Originally, I used the code by @myidealab from this other post: From password-protected Excel file to pandas DataFrame.
It currently looks like this:
import getpass

import pandas as pd
import win32com.client

data_file = <path_for_file>

# Load excel file
xlApp = win32com.client.Dispatch("Excel.Application")
xlApp.Visible = False
pswd = getpass.getpass('password: ')
xldatabase = xlApp.Workbooks.Open(data_file, False, True, None, pswd)

dfdatabase = []
for sh in xldatabase.Sheets:
    xlsheet = xldatabase.Worksheets(sh.Name)
    # Get last_row
    row_num = 0
    cell_val = ''
    while cell_val != None:
        row_num += 1
        cell_val = xlsheet.Cells(row_num, 1).Value
    last_row = row_num - 1
    # Get last_column
    col_num = 0
    cell_val = ''
    while cell_val != None:
        col_num += 1
        cell_val = xlsheet.Cells(1, col_num).Value
    last_col = col_num - 1
    # Get content
    content = xlsheet.Range(xlsheet.Cells(1, 1), xlsheet.Cells(last_row, last_col)).Value
    # Load each sheet as a dataframe
    dfdatabase.append(pd.DataFrame(list(content[1:]), columns=content[0]))
Now, I am getting the following error:
AttributeError: 'pywintypes.datetime' object has no attribute
'nanosecond'
The problem seems to boil down to the lines below:
# Get content
content = xlsheet.Range(xlsheet.Cells(1, 1), xlsheet.Cells(last_row, last_col)).Value
# Load each sheet as a dataframe
dfdatabase.append(pd.DataFrame(list(content[1:]), columns=content[0]))
The xlsheet.Range().Value call reads the data and wraps dates as pywintypes objects, which pd.DataFrame() fails to interpret.
Has anyone run into this issue before? Is there a way to specifically tell xlsheet.Range().Value how to read the values in a way that pandas can interpret?
Any help will be welcome!
Thank you.
This solves the issue, assuming you know beforehand the size/formatting of the dates/times in your Excel sheet.
There might be other, more elegant ways to solve it, nonetheless.
Note: content is initially a tuple. Position [0] is the array containing the headers and the remaining positions contain the data.
import datetime
import pywintypes
...
content = xlsheet.Range(xlsheet.Cells(1, 1), xlsheet.Cells(last_row, last_col)).Value
head = content[0]
data = list(content[1:])
for x in range(0, len(data)):
    data[x] = list(data[x])
    for y in range(0, len(data[x])):
        if isinstance(data[x][y], pywintypes.TimeType):
            # Cut off the "+00:00" timezone suffix. Note that
            # rstrip("+00:00") would be wrong here: rstrip removes any
            # trailing '+', '0' or ':' characters, not the literal suffix.
            temp = str(data[x][y]).split("+")[0].strip()
            if len(temp) > 10:
                # e.g. "2019-05-13 12:30:00" (full datetime)
                data[x][y] = datetime.datetime.strptime(temp, "%Y-%m-%d %H:%M:%S")
            elif len(temp) > 5 and len(temp) <= 10:
                # e.g. "2019-05-13" (date only)
                data[x][y] = datetime.datetime.strptime(temp, "%Y-%m-%d")
            elif len(temp) <= 5:
                # e.g. "12:30" (time only)
                data[x][y] = datetime.datetime.strptime(temp, "%H:%M")
            print(data[x][y])
# Load each sheet as a dataframe
dfdatabase.append(pd.DataFrame(data, columns=head))
Used this as a reference:
python-convert-pywintyptes-datetime-to-datetime-datetime
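A hedged alternative sketch (not from the referenced post): let pandas do the parsing instead, since pd.Timestamp accepts the string form of these values:

import pandas as pd

df = pd.DataFrame(list(content[1:]), columns=content[0])
for col in df.columns:
    # only convert entries that actually hold pywintypes values
    mask = df[col].map(lambda v: isinstance(v, pywintypes.TimeType))
    if mask.any():
        df.loc[mask, col] = df.loc[mask, col].map(
            lambda v: pd.Timestamp(str(v).split("+")[0].strip()))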

Python read XML file (near 50mb)

I'm parsing an XML string into a CSV string, but it's going very slowly:
import copy
import xml.etree.ElementTree as ET

INDEX_COLUMN = "{urn:schemas-microsoft-com:office:spreadsheet}Index"
CELL_ELEMENT = "Cell"
DATA_ELEMENT = "Data"

def parse_to_csv_string(xml):
    print('parse_to_csv_string')
    csv = []
    parsed_data = serialize_xml(xml)
    rows = list(parsed_data[1][0])
    header = get_cells_text(rows[0])
    rows.pop(0)
    csv.append(",".join(header))
    for row in rows:
        values = get_cells_text(row)
        csv.append(",".join(values))
    return "\n".join(csv)

def serialize_xml(xml):
    return ET.fromstring(xml)

def get_cells_text(row):
    keys = []
    cells = normalize_row_cells(row)
    for elm in cells:
        keys.append(elm[0].text or "")
    while len(keys) < 92:
        keys.append("")
    return keys

def normalize_row_cells(row):
    cells = list(row)
    updated_cells = copy.deepcopy(cells)
    pos = 1
    for elm in cells:
        strIndexAttr = elm.get(INDEX_COLUMN)
        index = int(strIndexAttr) if strIndexAttr else pos
        while index > pos:
            empty_elm = ET.Element(CELL_ELEMENT)
            child = ET.SubElement(empty_elm, DATA_ELEMENT)
            child.text = ""
            updated_cells.insert(pos - 1, empty_elm)
            pos += 1
        pos += 1
    return updated_cells
The XML string sometimes misses a few columns, and I need to iterate over it to fill in the missing columns (every row must have 92 columns). That's why I have some helper functions to manipulate the XML.
Right now I'm running my function as a Lambda with 4 GB of memory and still getting timeouts :(
Any idea how to improve performance?
The normalize_row_cells function constructs ElementTree Element instances, but get_cells_text is only interested in each instance's child's text attribute, so I would consider changing normalize_row_cells to just return the text. Also, it's performing copies and calling list.insert: inserting elements into the middle of lists can be expensive, because each element after the insertion point must be moved.
Something like this (untested code) avoids making copies and insertions and returns only the required text, making get_cells_text redundant.
def normalize_row_cells(row):
    cells = list(row)
    updated_cells = []
    cell_index = 0  # position in the source cells
    for pos in range(1, 93):  # output columns 1..92
        if cell_index < len(cells):
            elm = cells[cell_index]
            strIndexAttr = elm.get(INDEX_COLUMN)
            index = int(strIndexAttr) if strIndexAttr else pos
            if index == pos:
                updated_cells.append(elm[0].text or "")
                cell_index += 1
            else:
                # this output column has no source cell; leave it empty
                updated_cells.append("")
        else:
            updated_cells.append("")
    return updated_cells
If you can match your cells to their header names then using csv.DictWriter from the standard library might be even better (you need to profile to be sure).
import csv
import io

def parse_to_csv_string(xml):
    print('parse_to_csv_string')
    parsed_data = serialize_xml(xml)
    rows = list(parsed_data[1][0])
    header = get_cells_text(rows[0])
    rows.pop(0)
    with io.StringIO() as f:
        writer = csv.DictWriter(f, fieldnames=header)
        writer.writeheader()
        for row in rows:
            row = get_cells_text(row)
            writer.writerow(row)
        f.seek(0)
        data = f.read()
    return data

def get_cells_text(row):
    row_dict = {}
    for cell in row:
        column_name = get_column_name(cell)  # <- can this be done?
        row_dict[column_name] = cell[0].text or ""
    return row_dict
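If memory rather than CPU turns out to be the bottleneck at ~50 MB, a further hedged sketch: stream the document with ET.iterparse and discard each row once processed. The Row tag name is an assumption based on the SpreadsheetML namespace already used for INDEX_COLUMN:

import io
import xml.etree.ElementTree as ET

ROW_ELEMENT = "{urn:schemas-microsoft-com:office:spreadsheet}Row"

def iter_rows(xml_string):
    # iterparse wants a file or path, so wrap the string in a file-like object
    for event, elem in ET.iterparse(io.StringIO(xml_string), events=("end",)):
        if elem.tag == ROW_ELEMENT:
            yield elem
            elem.clear()  # release children so memory stays flat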

web scraping and transferring data into excel using python

I'm able to fully scrape the material I need; the problem is I can't get the data into Excel.
from lxml import html
import requests
import xlsxwriter

page = requests.get('website that gets mined')
tree = html.fromstring(page.content)
items = tree.xpath('//h4[@class="item-title"]/text()')
prices = tree.xpath('//span[@class="price"]/text()')
description = tree.xpath('//div[@class="description text"]/text()')
print 'items: ', items
print 'Prices: ', prices
print 'description', description
Everything works fine until this section, where I try to get the data into Excel. This is the error message:
for items,prices,description in (array):
ValueError: too many values to unpack
Exception Exception: Exception('Exception caught in workbook destructor. Explicit close() may be required for workbook.',) in <bound method Workbook.__del__ of <xlsxwriter.workbook.Workbook object at 0x104735e10>> ignored
This is what I was trying to do:
array = [items, prices, description]
workbook = xlsxwriter.Workbook('test1.xlsx')
worksheet = workbook.add_worksheet()
row = 0
col = 0
for items, prices, description in (array):
    worksheet.write(row, col, items)
    worksheet.write(row, col + 1, prices)
    worksheet.write(row, col + 2, description)
    row += 1
workbook.close()
Assuming that items, prices and description all have the same length, you could rewrite the final part of the code as:
for item, price, desc in zip(items, prices, description):
    worksheet.write(row, col, item)
    worksheet.write(row, col + 1, price)
    worksheet.write(row, col + 2, desc)
    row += 1
If the lists can have unequal lengths you should check this question for alternatives to the zip method, but I would be worried about the data consistency.
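One such alternative, sketched here in the same Python 2 register as the question (izip_longest pads the shorter lists with fillvalue):

from itertools import izip_longest

row = 0
for item, price, desc in izip_longest(items, prices, description, fillvalue=''):
    worksheet.write(row, 0, item)
    worksheet.write(row, 1, price)
    worksheet.write(row, 2, desc)
    row += 1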
Inevitably, it will be easier to write to a CSV file, or a Text file, rather than an Excel file.
import urllib2

listOfStocks = ["AAPL", "MSFT", "GOOG", "FB", "AMZN"]
urls = []
for company in listOfStocks:
    urls.append('http://real-chart.finance.yahoo.com/table.csv?s=' + company + '&d=6&e=28&f=2015&g=m&a=11&b=12&c=1980&ignore=.csv')

Output_File = open('C:/your_path_here/Data.csv', 'w')
New_Format_Data = ''
for counter in range(0, len(urls)):
    Original_Data = urllib2.urlopen(urls[counter]).read()
    if counter == 0:
        New_Format_Data = "Company," + urllib2.urlopen(urls[counter]).readline()
    rows = Original_Data.splitlines(1)
    for row in range(1, len(rows)):
        New_Format_Data = New_Format_Data + listOfStocks[counter] + ',' + rows[row]
Output_File.write(New_Format_Data)
Output_File.close()
OR
from bs4 import BeautifulSoup
import urllib2

var_file = urllib2.urlopen("http://www.imdb.com/chart/top")
var_html = var_file.read()
var_file.close()

text_file = open("C:/your_path_here/Text1.txt", "wb")
soup = BeautifulSoup(var_html)
for item in soup.find_all(class_='lister-list'):
    for link in item.find_all('a'):
        # print(link)
        z = str(link)
        text_file.write(z + "\r\n")
text_file.close()
As a developer, it's difficult to programmatically manipulate Excel files, since the Excel format is proprietary. This is especially true for languages other than .NET. On the other hand, it's easy to programmatically manipulate CSV, since files in that format are, after all, simple text files.
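To illustrate that point with the scraped lists from the question (a hedged sketch, Python 2 like the code above; the output path is a placeholder):

import csv

with open('C:/your_path_here/items.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(['item', 'price', 'description'])
    # zip pairs the three lists row by row, as in the earlier answer
    for item, price, desc in zip(items, prices, description):
        writer.writerow([item, price, desc])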

How to extract column and row in csv using python

I have this input in a file.csv:
"","min","max","rainfall","days_clear"
"Missouri",-2,10,300,23
"Amsterdam",-3,5,1212,34
"LA",10,20,1000,54
I wanted to write a simple program to find the city with the lowest rainfall, which is Missouri in this case. How can I do that using Python's csv reader?
I can try to extract the items, but unfortunately the first (header) row of the file has to be there.
I wanted to have something like count[Missouri]=300, count[Amsterdam]=1212, etc., so that I can take the minimum and reference back to print the city.
Please advise. Thanks.
import csv

def main():
    with open('file.csv', 'rb') as inf:
        data = [(int(row['rainfall']), row['']) for row in csv.DictReader(inf)]
    data.sort()
    print data[0]

if __name__ == "__main__":
    main()
returns
(300, 'Missouri')
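Since only the smallest item is needed, min() can replace the sort (same Python 2 register):

print min(data)  # -> (300, 'Missouri')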
One way to do this would be to use the csv module's DictReader class to write a function to extract the column of data. DictReader will take care of handling the first row of field names automatically. The built-in min() function can then be used to determine the item with the smallest value in the column.
import csv

def csv_extract_col(csvinput, colname, key):
    """ extract a named column from a csv stream into a dictionary
        colname: name of column to extract
        key: name of another column to use as keys in returned dict
    """
    col = {}
    for row in csv.DictReader(csvinput):
        col[row[key]] = row[colname]
    return col

if __name__ == '__main__':
    import StringIO
    csvdata = """\
"","min","max","rainfall","days_clear" # field name row
"Missouri",-2,10,300,23
"Amsterdam",-3,5,1212,34
"LA",10,20,1000,54
"""
    csvfile = StringIO.StringIO(csvdata)
    rainfall = csv_extract_col(csvfile, 'rainfall', '')
    print rainfall
    # {'Amsterdam': '1212', 'LA': '1000', 'Missouri': '300'}
    print min(rainfall.iteritems(), key=lambda r: float(r[1]))
    # ('Missouri', '300')
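For reference, a hedged sketch of the same run under Python 3 (io.StringIO replaces StringIO, and iteritems becomes items):

import io

csvfile = io.StringIO(csvdata)
rainfall = csv_extract_col(csvfile, 'rainfall', '')
print(min(rainfall.items(), key=lambda r: float(r[1])))
# ('Missouri', '300')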
import StringIO
import csv

example = """"","min","max","rainfall","days_clear"
"Missouri",-2,10,300,23
"Amsterdam",-3,5,1212,34
"LA",10,20,1000,54
"""

data_in = StringIO.StringIO(example)
# data_in = open('mycsvdata.csv')

def read_data(data_in):
    reader = csv.reader(data_in)
    cols = []
    results = {}
    for row in reader:
        if not cols:
            cols = row
            continue
        row = [int(x) if x.lstrip('-').isdigit() else x for x in row]
        results[row[0]] = dict(zip(cols[1:], row[1:]))
    return results

data = read_data(data_in)
min(data.items(), key=lambda x: x[1].get('rainfall'))
Returns
('Missouri', {'max': 10, 'days_clear': 23, 'rainfall': 300, 'min': -2})
To read from a file, you need to remove all code that deals with a string:
reader = csv.reader(open('file.csv', 'rb'))
rainfall = csv_extract_col(reader, 'rainfall', '')
Update: Sorry, it needs a bit more work than that. The first arg of csv_extract_col will be used as the first arg of csv.DictReader, so (in this case) it should be an open file object, and should never be a csv.reader instance. See below:
import csv

### def csv_extract_col(csvinput, colname, key):
###     exactly as provided by @martineau

if __name__ == '__main__':
    import sys
    filename, data_col_name, key_col_name = sys.argv[1:4]
    input_file_object = open(filename, 'rb')
    result_dict = csv_extract_col(input_file_object, data_col_name, key_col_name)
    print result_dict
    print min(result_dict.iteritems(), key=lambda r: float(r[1]))
Results:
command-prompt>\python27\python joj_csv.py joj.csv rainfall ""
{'Amsterdam': '1212', 'LA': '1000', 'Missouri': '300'}
('Missouri', '300')
command-prompt>\python27\python joj_csv.py joj.csv days_clear ""
{'Amsterdam': '34', 'LA': '54', 'Missouri': '23'}
('Missouri', '23')
Update 2 in response to comment """there must be something i missed out.. i tried.. [what looks like @martineau's function] with the above main function you define. Then in my shell, i define python rainfall "". But it gives me KeyError: 'rainfall'"""
Two possibilities:
(1) You made a mistake patching the pieces of source code together. Check your work.
(2) Your file doesn't have the expected heading row contents. Try some debugging, e.g. change @martineau's code so that you can insert a print statement etc. to show what the csv.DictReader thinks about your heading row:
reader = csv.DictReader(csvinput)
print "fieldnames", reader.fieldnames
assert colname in reader.fieldnames
assert key in reader.fieldnames
for row in reader:
If you are still stuck, show us ALL of your code plus the full traceback and error message -- either edit your question or put it up on pastebin or dropbox; DON'T put it into a comment!!
My code for cases in which there are several cities having the same minimum or several cities having the same maximum:
import csv

def minmax_col(filename, key, colname):
    with open(filename, 'rb') as csvfile:
        rid = csv.DictReader(csvfile,
                             fieldnames=None,
                             quoting=csv.QUOTE_NONNUMERIC)
        mini = float('inf')
        maxi = float('-inf')
        limin, limax = [], []  # two separate lists; limin = limax = [] would alias them
        for row in rid:
            if row[colname] == maxi:
                limax.append(row[key])
            elif row[colname] > maxi:
                maxi = row[colname]
                limax = [row[key]]
            if row[colname] == mini:
                limin.append(row[key])
            elif row[colname] < mini:
                mini = row[colname]
                limin = [row[key]]
    return (key, (maxi, limax), (mini, limin))

key = 'rainfall'
city, (Ma, liMa), (mi, limi) = minmax_col('filename.csv', '', key)
print 'Cities analysed on ' + repr(key) + ' parameter :'
print 'maximum==', Ma, ' cities :', ', '.join(liMa)
print 'minimum==', mi, ' cities :', ', '.join(limi)
print

key = 'min'
city, (Ma, liMa), (mi, limi) = minmax_col('filename.csv', '', key)
print 'Cities analysed on ' + repr(key) + ' parameter :'
print 'maximum==', Ma, ' cities :', ', '.join(liMa)
print 'minimum==', mi, ' cities :', ', '.join(limi)
On a file like this:
"","min","max","rainfall","days_clear"
"Missouri",-2,10,300,23
"Amsterdam",-3,5,1212,34
"Oslo",-2,8,800,12
"LA",10,20,1000,54
"Kologoro",28,45,1212,1
the result is
Cities analysed on 'rainfall' parameter :
maximum== 1212.0 cities : Amsterdam, Kologoro
minimum== 300.0 cities : Missouri
Cities analysed on 'min' parameter :
maximum== 28.0 cities : Kologoro
minimum== -3.0 cities : Amsterdam
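A hedged modern alternative, not part of the original answer (assuming pandas is available; the unnamed first column holds the city names):

import pandas as pd

df = pd.read_csv('filename.csv', index_col=0)
print(df['rainfall'].idxmin())  # -> Missouri
print(df['rainfall'].idxmax())  # note: ties return only the first city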
