Web scraping and transferring data into Excel using Python

I'm able to fully scrape the material I need; the problem is I can't get the data into Excel.
from lxml import html
import requests
import xlsxwriter
page = requests.get('website that gets mined')
tree = html.fromstring(page.content)
items = tree.xpath('//h4[@class="item-title"]/text()')
prices = tree.xpath('//span[@class="price"]/text()')
description = tree.xpath('//div[@class="description text"]/text()')
print 'items: ', items
print 'Prices: ', prices
print 'description', description
Everything works fine until this section, where I try to get the data into Excel. This is the error message:
for items,prices,description in (array):
ValueError: too many values to unpack
Exception Exception: Exception('Exception caught in workbook destructor. Explicit close() may be required for workbook.',) in <bound method Workbook.__del__ of <xlsxwriter.workbook.Workbook object at 0x104735e10>> ignored
This is what I was trying to do:
array = [items,prices,description]
workbook = xlsxwriter.Workbook('test1.xlsx')
worksheet = workbook.add_worksheet()
row = 0
col = 0
for items, prices, description in (array):
    worksheet.write(row, col, items)
    worksheet.write(row, col + 1, prices)
    worksheet.write(row, col + 2, description)
    row += 1
workbook.close()

Assuming that items, prices and description all have the same length, you could rewrite the final part of the code as:
for item, price, desc in zip(items, prices, description):
    worksheet.write(row, col, item)
    worksheet.write(row, col + 1, price)
    worksheet.write(row, col + 2, desc)
    row += 1
If the lists can have unequal lengths, you should look into alternatives to the zip method (see the sketch below), but I would be worried about the data consistency.
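For example, a minimal sketch using itertools.izip_longest (zip_longest on Python 3), which pads the shorter lists with a fill value instead of silently dropping rows:
from itertools import izip_longest  # itertools.zip_longest on Python 3

# Sketch only: pad missing values with an empty string so the rows stay aligned.
row = 0
for item, price, desc in izip_longest(items, prices, description, fillvalue=''):
    worksheet.write(row, 0, item)
    worksheet.write(row, 1, price)
    worksheet.write(row, 2, desc)
    row += 1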

Generally, it will be easier to write to a CSV file, or a text file, rather than an Excel file.
import urllib2

listOfStocks = ["AAPL", "MSFT", "GOOG", "FB", "AMZN"]
urls = []
for company in listOfStocks:
    urls.append('http://real-chart.finance.yahoo.com/table.csv?s=' + company + '&d=6&e=28&f=2015&g=m&a=11&b=12&c=1980&ignore=.csv')

Output_File = open('C:/your_path_here/Data.csv', 'w')
New_Format_Data = ''
for counter in range(0, len(urls)):
    Original_Data = urllib2.urlopen(urls[counter]).read()
    if counter == 0:
        New_Format_Data = "Company," + urllib2.urlopen(urls[counter]).readline()
    rows = Original_Data.splitlines(1)
    for row in range(1, len(rows)):
        New_Format_Data = New_Format_Data + listOfStocks[counter] + ',' + rows[row]
Output_File.write(New_Format_Data)
Output_File.close()
OR
from bs4 import BeautifulSoup
import urllib2

var_file = urllib2.urlopen("http://www.imdb.com/chart/top")
var_html = var_file.read()
text_file = open("C:/your_path_here/Text1.txt", "wb")
var_file.close()
soup = BeautifulSoup(var_html)
for item in soup.find_all(class_='lister-list'):
    for link in item.find_all('a'):
        # print(link)
        z = str(link)
        text_file.write(z + "\r\n")
text_file.close()
As a developer, it's difficult to programmatically manipulate Excel files, since the Excel format is proprietary. This is especially true for languages other than .NET. On the other hand, it's easy to programmatically manipulate CSV files since, after all, they are simple text files.
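For instance, a minimal sketch of writing the scraped lists from the question to CSV instead of Excel (assuming items, prices and description are the lists built with lxml above; on Python 2 the csv module expects byte strings, so non-ASCII text may need to be encoded first):
import csv

# Sketch: one row per scraped item, written as plain text.
with open('test1.csv', 'wb') as f:  # on Python 3 use open('test1.csv', 'w', newline='')
    writer = csv.writer(f)
    writer.writerow(['Item', 'Price', 'Description'])
    for item, price, desc in zip(items, prices, description):
        writer.writerow([item, price, desc])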

Related

AttributeError: 'pywintypes.datetime' object has no attribute 'nanosecond'

I have some code to open an Excel file and save it as a pandas DataFrame. It was originally used in Python 2.7 and I am currently trying to make it work under Python 3.
Originally, I used the code by #myidealab from this other post: From password-protected Excel file to pandas DataFrame.
It currently looks like this:
data_file = <path_for_file>

# Load excel file
xlApp = win32com.client.Dispatch("Excel.Application")
xlApp.Visible = False
pswd = getpass.getpass('password: ')
xldatabase = xlApp.Workbooks.Open(data_file, False, True, None, pswd)

dfdatabase = []
for sh in xldatabase.Sheets:
    xlsheet = xldatabase.Worksheets(sh.Name)
    # Get last_row
    row_num = 0
    cell_val = ''
    while cell_val != None:
        row_num += 1
        cell_val = xlsheet.Cells(row_num, 1).Value
    last_row = row_num - 1
    # Get last_column
    col_num = 0
    cell_val = ''
    while cell_val != None:
        col_num += 1
        cell_val = xlsheet.Cells(1, col_num).Value
    last_col = col_num - 1
    # Get content
    content = xlsheet.Range(xlsheet.Cells(1, 1), xlsheet.Cells(last_row, last_col)).Value
    # Load each sheet as a dataframe
    dfdatabase.append(pd.DataFrame(list(content[1:]), columns=content[0]))
Now, I am getting the following error:
AttributeError: 'pywintypes.datetime' object has no attribute
'nanosecond'
The problem seems to boil down to the lines below:
# Get content
content = xlsheet.Range(xlsheet.Cells(1, 1), xlsheet.Cells(last_row, last_col)).Value
# Load each sheet as a dataframe
dfdatabase.append(pd.DataFrame(list(content[1:]), columns=content[0]))
The xlsheet.Range().Value call reads the data and assigns pywintypes descriptors to it, which pd.DataFrame() fails to interpret.
Has anyone run into this issue before? Is there a way I can specifically tell xlsheet.Range().Value how to read the values in a way that pandas can interpret?
Any help will be welcome!
Thank you.
This solves the issue, assuming you know beforehand the size/formatting of your dates/times in the excel sheet.
There may be other, more elegant ways to solve it, nonetheless.
Note: content is initially a tuple. Position [0] is the array containing the headers and the remaining positions contain the data.
import datetime
import pywintypes
...
content = xlsheet.Range(xlsheet.Cells(1, 1), xlsheet.Cells(last_row, last_col)).Value
head = content[0]
data = list(content[1:])
for x in range(0, len(data)):
    data[x] = list(data[x])
    for y in range(0, len(data[x])):
        if isinstance(data[x][y], pywintypes.TimeType):
            temp = str(data[x][y]).rstrip("+00:00").strip()
            if len(temp) > 10:
                data[x][y] = datetime.datetime.strptime(temp, "%Y-%m-%d%H:%M")
            elif len(temp) > 5 and len(temp) <= 10:
                data[x][y] = datetime.datetime.strptime(temp, "%Y-%m-%d")
            elif len(temp) <= 5:
                data[x][y] = datetime.datetime.strptime(temp, "%H:%M")
            print(data[x][y])
# Load each sheet as a dataframe
dfdatabase.append(pd.DataFrame(data, columns=head))
Used this as a reference:
python-convert-pywintyptes-datetime-to-datetime-datetime
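An alternative sketch (not from the original answer) is to build plain datetime.datetime objects directly from the fields of the pywintypes value instead of round-tripping through str(); this assumes the values expose the usual year/month/day/hour/minute/second attributes:
import datetime
import pywintypes

def to_plain_datetime(value):
    # Sketch: convert a pywintypes time into a plain datetime pandas can handle,
    # and leave every other value untouched.
    if isinstance(value, pywintypes.TimeType):
        return datetime.datetime(value.year, value.month, value.day,
                                 value.hour, value.minute, value.second)
    return value

data = [[to_plain_datetime(v) for v in row] for row in data]
# then build the DataFrame as before: pd.DataFrame(data, columns=head)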

Unable to concatenate 2 lists using Python (Int and text lists) and to split output data into separate files

I am facing an issue with concatenating the data of 2 lists. I have the page number (an integer value) and the table data (text values), both within lists; while merging them to print the output I am getting this error:
TypeError: object of type 'int' has no len()
My goal is to print the following output. I also need help saving the data from table 3: rows based on the "Aircraft operator ID" keyword should go into a table named "Table 3 A", and rows with a header value of "Installation ID" into a table named "Table 3 I", in 2 separate tabs of the Excel file. My code is given below:
import time
import requests
import random
from lxml import html  # used to use Xpath
from bs4 import BeautifulSoup
import xlsxwriter

def append_row(ws, row):
    for col, value in enumerate(row):
        ws.write_string(ws.cur_row, col, value)
    ws.cur_row += 1

workbook = xlsxwriter.Workbook('Output.xlsx')
ws_3_A = workbook.add_worksheet("Table 3 A")
ws_3_I = workbook.add_worksheet("Table 3 I")

# Keep a track of the row to use in each worksheet
ws_3_A.cur_row = 0
ws_3_I.cur_row = 0

# Code starts from here:
start = 1
end = 3
link = "http://ec.europa.eu/environment/ets/ohaDetails.do?returnURL=&languageCode=en&accountID=&registryCode=&buttonAction=all&action=&account.registryCode=&accountType=&identifierInReg=&accountHolder=&primaryAuthRep=&installationIdentifier=&installationName=&accountStatus=&permitIdentifier=&complianceStatus=&mainActivityType=-1&searchType=oha&resultList.currentPageNumber={}&nextList=Next%C2%A0%3E&selectedPeriods="

for page_number in range(start, end):
    print("Page {}".format(page_number))
    url = link.format(page_number)
    r = requests.get(url)
    print(url)
    serial_no = [int(x) for x in str(page_number)]
    print(serial_no)
    time.sleep(random.randint(2, 5))
    soup = BeautifulSoup(r.content, "lxml")
    # Table 3 Aircraft Operator ID data:
    for items in soup.find(id="tblChildDetails").find("table").find_all("tr")[1:]:
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        print(dataset)
        append_row(ws_3_A, serial_no + [url] + dataset)
    # Table 3 Installation ID data:
    for items in soup.find(id="tblChildDetails").find("table").find_all("tr")[1:]:
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        print(dataset)
        append_row(ws_3_I, serial_no + [url] + dataset)

workbook.close()
Current and expected output are attached as screenshots.
Traceback:
Traceback (most recent call last):
File "D:\QRS\Script.py", line 42, in <module>
append_row(ws_3_A, serial_no + [url] + dataset)
File "D:\QRS\Script.py", line 10, in append_row
ws.write_string(ws.cur_row, col, value)
File "C:\Users\varun\AppData\Roaming\Python\Python36\site-packages\xlsxwriter\worksheet.py", line 67, in cell_wrapper
return method(self, *args, **kwargs)
File "C:\Users\varun\AppData\Roaming\Python\Python36\site-packages\xlsxwriter\worksheet.py", line 500, in write_string
return self._write_string(row, col, string, cell_format)
File "C:\Users\varun\AppData\Roaming\Python\Python36\site-packages\xlsxwriter\worksheet.py", line 512, in _write_string
if len(string) > self.xls_strmax:
TypeError: object of type 'int' has no len()
Each element in [int(x) for x in str(page_number)] should be a string, because each element is passed to the value variable in append_row(), which calls write_string().
Then, to get the expected output, you need to ignore the first tr (the header row) on every page after the worksheet has been created.
Use try...finally, so you can close the workbook even when the script raises an error.
import time
import requests
import random
from lxml import html
from bs4 import BeautifulSoup
import xlsxwriter

def append_row(ws, row):
    for col, value in enumerate(row):
        ws.write_string(ws.cur_row, col, value)
    ws.cur_row += 1

workbook = xlsxwriter.Workbook('Output.xlsx')

def ws_3(name):
    return workbook.add_worksheet("Table 3 {}".format(name))

# Code starts from here:
start = 1
end = 5
link = "http://ec.europa.eu/environment/ets/ohaDetails.do?returnURL=&languageCode=en&accountID=&registryCode=&buttonAction=all&action=&account.registryCode=&accountType=&identifierInReg=&accountHolder=&primaryAuthRep=&installationIdentifier=&installationName=&accountStatus=&permitIdentifier=&complianceStatus=&mainActivityType=-1&searchType=oha&resultList.currentPageNumber={}&nextList=Next%C2%A0%3E&selectedPeriods="
coldict = {}

try:
    for page_number in [1, 2, 3, 342, 343]:
        print("Page {}".format(page_number))
        url = link.format(page_number)
        r = requests.get(url)
        serial_no = [str(page_number)]
        time.sleep(random.randint(2, 5))
        soup = BeautifulSoup(r.content, "lxml")
        # Table 3 Aircraft Operator ID data:
        tr = soup.find(id="tblChildDetails").find("table").find_all("tr")
        dataset = [item.get_text(strip=True) for item in tr[1].find_all("td")]
        # select or create new table
        if not coldict.get(dataset[0]):
            ws = ws_3(dataset[0])
            ws.cur_row = 0
            coldict[dataset[0]] = ws
            append_row(ws, ["Page no", "Url"] + dataset)
        else:
            ws = coldict.get(dataset[0])
        for items in tr[2:]:
            dataset = [item.get_text(strip=True) for item in items.find_all("td")]
            print(url)
            print(dataset)
            append_row(ws, serial_no + [url] + dataset)
finally:
    workbook.close()

python csv write error while writing a float value as a column

I want to write the values fetched from the URL to a CSV file, and some of them are float values. The code below shows an error: "float found".
import urllib2
import json
import csv

url = 'https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime=2016-10-01&endtime=2016-10-02'
i = 0
csvfile = csv.writer(open('earthquakedet.csv', 'w'))
csvfile.writerow(["Latitude", "Longitude ", "Title", "Place", "Mag"])
json_string = urllib2.urlopen(url).read()
j = json.loads(json_string)
names = [d['properties'] for d in j['features']]
names1 = [d['geometry'] for d in j['features']]
while i <= len(names):
    print names[i]['title']
    print names[i]['place']
    print names[i]['mag']
    print names1[i]['coordinates'][0]
    print names1[i]['coordinates'][1]
    i = i + 1
    finalstr = float(names1[i]['coordinates'][0]) + float(names1[i]['coordinates'][1]) + names[i]['title'] + names[i]['place'] + names[i]['mag']
    csvfile.writerow(finalstr)
csvfile.close()
writerow takes a list of values to put on the row, not a string. So, instead of concatenating the values yourself, just put them in a list to pass to writerow:
# ...
i = i + 1
csvfile.writerow([names1[i]['coordinates'][0], names1[i]['coordinates'][1], names[i]['title'], names[i]['place'], names[i]['mag']])
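A fuller sketch of the loop (names follow the question's code; note that GeoJSON coordinates are ordered [longitude, latitude, depth], and that csv.writer has no close() method, so the file object is kept separately):
import csv

out = open('earthquakedet.csv', 'w')
writer = csv.writer(out)
writer.writerow(["Latitude", "Longitude", "Title", "Place", "Mag"])
for props, geom in zip(names, names1):
    # coordinates[1] is latitude, coordinates[0] is longitude
    writer.writerow([geom['coordinates'][1], geom['coordinates'][0],
                     props['title'], props['place'], props['mag']])
out.close()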

Extract Header and Table text from a .docx file

I'm trying to extract page and header data from a docx file. The file is several hundred pages, each with a table and a header. The header has pertinent information that needs to be paired with each table. I'm able to extract the header and table data; I just can't reliably pair them together.
Using win32com, this is what I've got so far:
# getting the table page number
app = Dispatch("Word.Application")
doc = app.Documents.Open(filename)
table_1_page = doc.Tables(1).Range.Information(3) # 3 == wdActiveEndPageNumber
The problem occurs because the header's TextFrames are duplicated on multiple pages, so when I call:
# getting the header page number
doc.Sections(1).Headers(1).Shapes(1).TextFrame.TextRange.Information(3)
I get one of the pages that the TextFrame occurs on. The page it chooses seems somewhat arbitrary; sometimes it's the first, other times it's the last, but it's not predictable.
I've spent a bit of time reading over the object model here. Ultimately it would be nice to capture all of the items displayed per page without reinventing the wheel.
EDIT 10/25/16: per request, here is some minimum working code
# filename docx_parser.py
import pythoncom

class OpenDoc(object):
    def __init__(self, docx_path):
        import win32com.client as win32
        self.path = docx_path
        self.word = win32.Dispatch("Word.Application")
        self.word.Visible = 0
        self.word.Documents.Open(self.path)
        self.doc = self.word.ActiveDocument

    def get_table_count(self):
        return self.doc.Tables.Count

    def count_table_rows(self, table):
        return table.Rows.Count

    def count_table_columns(self, table):
        return table.Columns.Count

    def get_headers(self):
        headers = self.doc.Sections(1).Headers(1)
        shape_count = headers.Shapes.Count
        for shape_num in range(1, shape_count + 1):
            t_range = headers.Shapes(shape_num).TextFrame.TextRange
            text = t_range.Text
            page_num = t_range.Information(3)  # 3 == wdActiveEndPageNumber
            yield text, page_num

    def get_table_text(self, table):
        col_count = self.count_table_columns(table)
        row_count = self.count_table_rows(table)
        for row in range(1, row_count + 1):
            row_data = []
            for col in range(1, col_count + 1):
                try:
                    row_data.append(table.Cell(Row=row, Column=col).Range.Text.strip(chr(7) + chr(13)))
                except pythoncom.com_error as error:
                    row_data.append("")
            yield row_data

    def get_all_table_text(self):
        for table in self.get_tables():
            table_data = []
            for row_data in self.get_table_text(table):
                table_data.append(row_data)
            yield table_data

    def get_tables(self):
        for table in self.doc.Tables:
            yield table

    def __del__(self):
        self.word.Quit()

if __name__ == "__main__":
    try:
        path = r"sample.docx"
        open_doc = OpenDoc(path)
        for table_num, table_text in enumerate(open_doc.get_all_table_text()):
            print("\n-------------- Table %s ----------------" % (table_num + 1))
            for row_data in table_text:
                print(", ".join(row_data))
        for header_text, page_num in open_doc.get_headers():
            print("header page number: %s, text: %s" % (page_num, header_text))
    except Exception as error:
        from traceback import format_exc
        print(format_exc())
    raw_input("")

read Chinese character from excel file python3

I have an Excel file that contains two columns: the first one in Chinese and the second just a link.
I tried two methods I found here, but they didn't work and I can't print the values in the console. I changed the encoding setting in PyCharm to UTF-8, and it still doesn't work.
I used the pandas and xlrd libs; neither worked for me, while they worked for others who posted.
This is my current code:
from xlrd import open_workbook

class Arm(object):
    def __init__(self, id, dsp_name):
        self.id = id
        self.dsp_name = dsp_name

    def __str__(self):
        return("Arm object:\n"
               "  Arm_id = {0}\n"
               "  DSPName = {1}\n"
               .format(self.id, self.dsp_name))

if __name__ == '__main__':
    wb = open_workbook('test.xls')
    for sheet in wb.sheets():
        print(sheet)
        number_of_rows = sheet.nrows
        number_of_columns = sheet.ncols
        items = []
        rows = []
        for row in range(1, number_of_rows):
            values = []
            for col in range(number_of_columns):
                value = str(sheet.cell(row, col).value)
                for a in value:
                    print('\n'.join([a]))
                values.append(value)
                print(value)
        for item in items:
            print(item)
            print("Accessing one single value (eg. DSPName): {0}".format(item.dsp_name))
            print()
Obviously it's not working; I was just messing around with it after giving up.
File : http://www59.zippyshare.com/v/UxITFjis/file.html
It's not about encoding; you are not accessing the right rows.
On line 24,
for row in range(1, number_of_rows):
why do you want to start with 1 instead of 0?
Try for row in range(number_of_rows): instead.
Well, the problem I had wasn't actually in reading the Chinese characters! My problem was in printing to the console.
I thought the print encoding worked fine and that I just wasn't reading the characters, but this code works fine:
from xlrd import open_workbook

wb = open_workbook('test.xls')
messages = []
links = []
for sheet in wb.sheets():
    number_of_rows = sheet.nrows
    number_of_columns = sheet.ncols
    for row in range(1, number_of_rows):
        i = 0
        for col in range(number_of_columns):
            value = (sheet.cell(row, col).value).encode('gbk')
            if i == 0:
                messages.append(value)
            else:
                links.append(value)
            i += 1
print(links)
To check it, I pasted the first result into the Selenium driver (since I was going to use it anyway):
element = driver.find_element_by_class_name('email').send_keys(str(messages[0], 'gbk'))
and it works like a charm!
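A side note, not part of the original answer: on Python 3.7+ the console printing issue can often be avoided without the gbk round trip by reconfiguring stdout and keeping the cell values as str; a minimal sketch:
import sys
from xlrd import open_workbook

# Sketch, assuming Python 3.7+ (io.TextIOWrapper.reconfigure) and a UTF-8 capable console.
sys.stdout.reconfigure(encoding='utf-8')

wb = open_workbook('test.xls')
sheet = wb.sheets()[0]
for row in range(1, sheet.nrows):
    print(sheet.cell(row, 0).value)  # Chinese text prints without manual encoding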
