Python: Creating AreaChart3D in a recursive way

My goal is to create some AreaChart3D plots automatically.
Precisely, for example, I have the following picture:
This table is output automatically by a tool.
I may have only one graph, maybe 2 graphs, or even 100 graphs (the exact number does not matter much); what matters is that every time I will have this kind of structure, with Location, Speed, and some times inside.
Now I would like to have the 4 graphs (or maybe 2, depending on how many graphs the tool outputs) in the second sheet (ws2_obj).
If the number of graphs were fixed, this would be easier.
Because the number is not fixed, I have to cover the entire sheet and I do not know how to do it.
There is also another question: how do I handle Depth (% of base) using Python?
import openpyxl as xl
from openpyxl.chart import (
    AreaChart3D,
    Reference,
)

wb_obj = xl.load_workbook('Plots.xlsx')
ws_obj = wb_obj.active
ws2_obj = wb_obj.create_sheet("Graphs")

c1 = AreaChart3D()
c1.legend = None
c1.style = 15
cats = Reference(ws_obj, min_col=1, min_row=7, max_row=200)
data = Reference(ws_obj, min_col=2, min_row=6, max_col=8, max_row=200)
c1.add_data(data, titles_from_data=True)
c1.set_categories(cats)
ws2_obj.add_chart(c1, "A1")
wb_obj.save("Plots.xlsx")
The code above produces only one graph, but how should I proceed to create 2, 4, or 100 graphs?
Later edit 1:
I tried something like this and it is almost working:
for i in range(1, 4):
    c1 = AreaChart3D()
    cats = Reference(ws_obj, min_col=1, min_row=7, max_row=200)
    data = Reference(ws_obj, min_col=2, min_row=6, max_col=i * int(step), max_row=200)
    c1.title = ws_obj.cell(row=1, column=i * int(step)).value
    c1.legend = None
    c1.style = 15
    c1.y_axis.title = 'Fire Time'
    c1.x_axis.title = 'Temperature'
    c1.z_axis.title = "Velocity"
    c1.add_data(data, titles_from_data=True)
    c1.set_categories(cats)
    ws2_obj.add_chart(c1, "A2")
For me, the last line, ws2_obj.add_chart(c1, "A2"), seems to be the problematic one.
Instead of "A2" I would like to use something like ws2_obj.add_chart(c1, cell(row=2, column=i)).value, but it does not work.
Later edit 2:
I have observed that if you want to add a chart to a certain cell, you have to use something like ws2_obj.add_chart(my_chart, "R2").
In order to use the for loop, I tried to find a way to generate this value "R2".
Please see below:
import re

my_cells = []
for i in range(1, 4):
    my_cell = ws2_obj.cell(row=1, column=i * int(step) - (int(step) - 1))
    my_cells.append(my_cell)
print("My_Cell:", my_cells)

new_cells = []
for i in my_cells:
    new_cells.append(re.findall(r"\W\w\d", str(i)))

new_new_cells = []
for i in new_cells:
    new_new_cells.append(i[0])
print("new_new_cells:", new_new_cells)

final_list = [re.sub('[^a-zA-Z0-9]+', '', _) for _ in new_new_cells]
print("final list:", final_list)
And the output will be ['A1', 'H1', 'O1']
and then I can output the graph:
for i in range(1, 4):
    c1 = AreaChart3D()
    # my_cell = ws2_obj.cell(row=i, column=i * int(step))
    cats = Reference(ws_obj, min_col=1, min_row=7, max_row=255)
    data = Reference(ws_obj, min_col=2, min_row=6, max_col=i * int(step), max_row=255)
    c1.title = ws_obj.cell(row=1, column=i * int(step)).value
    c1.legend = None
    c1.style = 20
    c1.y_axis.title = 'Time'
    c1.x_axis.title = 'Location'
    c1.z_axis.title = "Velocity"
    c1.add_data(data, titles_from_data=True)
    c1.set_categories(cats)
    c1.x_axis.scaling.max = 75
    c1.y_axis.scaling.max = 50
    c1.z_axis.scaling.max = 25
    ws2_obj.add_chart(c1, str(final_list[i - 1]))

You can create a list of the series data positions (the cell where each data series starts), one element per series. Iterate over that list, creating a chart for each element, and make sure you have some means of placing each chart at a unique position.
Example code with comments below.
import openpyxl as xl
from openpyxl.chart import (
    AreaChart3D,
    Reference,
)

def create_chart(tl, maxr, hdr, x_ax):
    """
    Creates a standard Area 3D Chart
    """
    cht = AreaChart3D()
    cht.legend = None
    cht.style = 15
    cht.title = hdr + " Chart"
    cht.x_axis.title = x_ax
    cht.y_axis.title = 'Something'  # Some text for the y axis

    data = Reference(ws_obj, min_col=tl[0], min_row=tl[1], max_col=tl[0] + 1, max_row=maxr - 1)
    cht.add_data(data, titles_from_data=True)
    return cht
## Sheet constants
chart_header = 'Speed' # It is assumed this is located in a merged cell
x_axis_header = 'Location'
series_topleft_header = 25
## Load Workbook and Sheet of Excel with data series
wb_obj = xl.load_workbook('Plots.xlsx')
ws_obj = wb_obj.active
## Get the total used rows in the sheet (end of the series table)
maxrows = ws_obj.max_row
speed_row = ''
speed_col_start = ''
speed_col_end = ''
speed_col_letter = ''
## Get a list of merged cells in the sheet; these contain the headers used for position referencing
merge_list = [m.coord for m in ws_obj.merged_cells.ranges]
## Search for the row with the header name 'Speed' to use as a reference for series data positioning
for merge_element in ws_obj.merged_cells:
    merge_cell_val = merge_element.start_cell.internal_value
    if merge_cell_val.lower() == chart_header.lower():
        speed_row = merge_element.max_row
        speed_col_start = merge_element.min_col
        speed_col_end = merge_element.max_col
        speed_col_letter = merge_element.start_cell.column_letter

series_header_row = speed_row + 1
series1_start = speed_col_letter + str(series_header_row + 1)
"""
Obtain the location of the top left cell where the series data exists
This searches the row below the header (containing the text 'Speed') for the first
series header (i.e. 25 in the example) and adds each position to the series_postion_list
"""
series_position_list = []
for row in ws_obj.iter_rows(min_row=series_header_row,
                            max_row=series_header_row,
                            min_col=speed_col_start,
                            max_col=speed_col_end):
    for cell in row:
        if cell.value == series_topleft_header:
            series_position_list.append([cell.column, series_header_row])
## Create the Charts
"""
With the series_position_list indicating the top left cell of the series data
and the number of rows in the series determined be the maxrows - 1. This data
can be passed to the create_chart function to create the chart.
Charts are placed below the series data table from Column A with two charts
per row. First row for chart location is 2 rows below the series table.
"""
chart_start_row = maxrows + 2
chart_col = 'A'
"""
The series_position_list is used to create 1 chart per series
The chart creation function takes the top left coordinate and max rows along
with Chart header name and x axis header name
"""
for enum, top_left in enumerate(series_position_list, 1):
    chart_obj = create_chart(top_left,
                             maxrows,
                             chart_header + ' ' + str(enum),
                             x_axis_header)
    ## This sets the position the chart will be placed at. Based on the standard
    ## plot area size, the charts are 16 rows and 10 columns apart
    if enum == 1:
        pass
    elif enum % 2 == 1:
        chart_col = 'A'
        chart_start_row += 16
    else:
        chart_col = 'J'
    ## Add the chart to the Excel sheet
    print(f"Adding chart {chart_header + ' ' + str(enum)} to Excel:")
    print(f"Series Data Start; Row:{str(top_left[1]+1)} Column:{top_left[0]}")
    ws_obj.add_chart(chart_obj, chart_col + str(chart_start_row))
    print("--------------\n")
wb_obj.save("Plots.xlsx")
-----------------Additional Information--------------
add_chart is a method that accepts two arguments: the chart object and, optionally, an anchor point (i.e. the top left cell where the chart is placed in the sheet). Using .value at the end of
ws2_obj.add_chart(c1, cell(row=2, column=i)).value
is invalid: you are not entering anything into a cell, you are using the method to add the chart object c1 at the position cell(row=2, column=i). cell(row=2, column=i) on its own is also invalid syntax; you may have meant ws2_obj.cell(row=2, column=i) as the anchor. That would be accepted by the add_chart method, but on saving the workbook the anchor check would raise an error, because it expects an "Excel style coordinate", i.e. a string like 'A2' rather than a cell object like ws2_obj.cell(row=2, column=i). Even using the tuple (2, 1) would fail the same check.
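For reference, a cell object already knows its own Excel-style coordinate via its .coordinate attribute, which is exactly the string form the anchor check wants. A minimal sketch, reusing c1, ws2_obj, and step from the question (the step == 7 spacing is my assumption, based on the ['A1', 'H1', 'O1'] output):

for i in range(1, 4):
    # .coordinate yields e.g. 'A2', 'H2', 'O2' when step == 7
    anchor_cell = ws2_obj.cell(row=2, column=i * int(step) - (int(step) - 1))
    ws2_obj.add_chart(c1, anchor_cell.coordinate)

This also avoids the regex round-trip over str(my_cell) from Later edit 2.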
To set the anchor points I will show how to do two options; All charts on the same row and X charts across the row then start next X charts on the next row etc.
Place all charts on the same row:
If you are going to put all the charts on the same row, the row coordinate never changes and only the column position needs adjusting for each chart.
You can generate the anchor points like below; the example code uses a for loop with 18 elements:
from openpyxl.utils.cell import coordinate_to_tuple
from openpyxl.utils import get_column_letter

anchor = 'A2'  # Position of anchor, first anchor point is 'A2'
column_separation = 9  # Number of columns to separate each chart

for i in range(0, 18):
    coord_tuple = coordinate_to_tuple(anchor)
    row = coord_tuple[0]
    col_offset = column_separation if i > 0 else 0
    col_new = get_column_letter(coord_tuple[1] + col_offset)
    anchor = f'{col_new}{row}'
    print(f'Adding chart at Anchor point {anchor}')
    ws2_obj.add_chart(c1, anchor)
This will put the charts at the following anchor points:
A2, J2, S2, AB2, AK2, AT2, BC2, BL2, BU2, CD2, CM2, CV2, DE2, DN2, DW2, EF2, EO2, EX2
Place the charts in a pattern of rows and columns:
This is similar to the previous code; however, when the number of charts across a row reaches your limit, the row value has to change and the column resets back to 'A'.
The example code again uses a for loop with 18 elements and splits the charts into rows of max_chart_row, set to 5 in this case:
from openpyxl.utils.cell import coordinate_to_tuple
from openpyxl.utils import get_column_letter

anchor = 'A2'
column_separation = 9
max_chart_row = 5

for i in range(0, 18):
    coord_tuple = coordinate_to_tuple(anchor)
    row = coord_tuple[0]
    col_offset = column_separation if i > 0 else 0
    # When the number of charts across the row is reached, set the row to 16 more
    # than the current row and reset the column offset to 0
    if i % max_chart_row == 0 and i != 0:
        row = row + 16
        col_offset = 0
        col_new = get_column_letter(col_offset + 1)
    else:
        col_new = get_column_letter(coord_tuple[1] + col_offset)
    anchor = f'{col_new}{row}'
    print(f'Adding chart at Anchor point {anchor}')
    ws2_obj.add_chart(c1, anchor)
This will put the charts at the following anchor points:
A2, J2, S2, AB2, AK2,
A18, J18, S18, AB18, AK18,
A34, J34, S34, AB34, AK34,
A50, J50, S50
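Regarding the earlier question about Depth (% of base): openpyxl exposes this on its 3D charts as the gapDepth property, which I believe maps directly to Excel's "Depth (% of base)" setting (a plain percentage value). A minimal sketch, assuming the same AreaChart3D as in the question:

c1 = AreaChart3D()
c1.gapDepth = 150  # assumed equivalent of Excel's "Depth (% of base)", as a percentage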

Related

How can I update my Dataframe with new columns and rows while webscraping?

I'm trying to create a webscraping tool that will update a DataFrame with data from multiple tables.
The page I'm working on has a base table in which every row has a link that directs you to a new URL that has a secondary table with the data I'm looking for.
My objective is to create a unique DataFrame comprising all the data present in all the secondary tables of the site.
The problem is that every secondary table can have a different set of columns from the previous one, depending on whether that secondary table has a value for that specific column or not, and I cannot know all the possible column types beforehand.
I have tried multiple solutions. What I'm working on at the moment is a for loop that repeatedly creates a new DataFrame out of each new table and merges it with the previous one.
But I'm stuck on trying to merge the two DataFrames on all the columns they have in common.
Please forgive me if I made amateur mistakes; I've been using Python for only a week.
#create the main DataFrame
link1 = links[0]
url_linked = url_l + link1
page_linked = requests.get(url_linked)
soup_linked = BeautifulSoup(page_linked.text, 'lxml')
table_linked = soup_linked.find('table', class_="XXXXX")
headers_link = []
headers_unique = []
for i in table_linked.find_all('th'):
    title_link = i.text
    title_link = map(str, title_link)
    headers_link.append(title_link)
headers_unique = headers_link
mydata_link = pd.DataFrame(columns=headers_link)
count = 1
for link in links:
    url_linked = url_l + link
    page_linked = requests.get(url_linked)
    soup_linked = BeautifulSoup(page_linked.text, 'lxml')
    table_linked = soup_linked.find('table', class_="table table-directory-responsive")
    row2 = []
    n_columns = len(table_linked.find_all('th'))
    #populating the main dataframe
    if count == 1:
        for j in table_linked.find_all('tr'):
            row_data = j.find_all('td')
            row = [i.text for i in row_data]
            row2.append(row)
        lenght_link = len(mydata_link)
        row2.remove([''])  #To get rid of empty rows that have no th
        mydata_link.loc[lenght_link] = row2
        print(mydata_link)
        print('Completed link ' + str(count))
        count = count + 1
    #creating the secondary DataFrame
    else:
        headers_test = []
        for i in table_linked.find_all('th'):
            title_test = i.text
            title_test = map(str, title_test)
            headers_test.append(title_test)
        mydata_temp = pd.DataFrame(columns=headers_test)
        for j in table_linked.find_all('tr'):
            row_data = j.find_all('td')
            row = [i.text for i in row_data]
            row2.append(row)
        lenght_link = len(mydata_link)
        row2.remove([''])  #To get rid of empty rows that have no th
        mydata_temp.loc[lenght_link] = row2
        print(mydata_temp)
        #merge the two DataFrames based on the unique set of columns they both have
        headers_unique = set(headers_unique).intersection(headers_test)
        mydata_link = mydata_link.merge(mydata_temp, on=[headers_unique], how='outer')
        print(mydata_link)
        print('Completed link ' + str(count))
        count = count + 1
What I need is basically a function that, given these sample DataFrames:

A  B  C
1  2  3

C  A  D  E
4  5  6  7

will return the following DataFrame:

A    B    C    D    E
1    2    3    NaN  NaN
5    NaN  4    6    7
Just concatenating all the secondary tables should do it - build a list of all the secondary DataFrames, and then pd.concat(dfList).
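As a quick, self-contained sketch of that behavior using the sample frames above (pd.concat unions the columns and fills the gaps with NaN):

import pandas as pd

df1 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'C'])
df2 = pd.DataFrame([[4, 5, 6, 7]], columns=['C', 'A', 'D', 'E'])

# rows are kept, columns are unioned, missing cells become NaN
combined = pd.concat([df1, df2], ignore_index=True)
print(combined)
#    A    B  C    D    E
# 0  1  2.0  3  NaN  NaN
# 1  5  NaN  4  6.0  7.0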
By the way, have you considered just using .read_html instead of looping through the cells?
#create the main DataFrame
link1 = links[0]
url_linked = url_l + link1
page_linked = requests.get(url_linked)
soup_linked = BeautifulSoup(page_linked.text, 'lxml')
table_linked = soup_linked.find('table', class_="XXXXX")
if table_linked:
    primaryDf = pd.read_html(table_linked.prettify())[0]
    headers_link = [h.get_text(' ').strip() for h in table_linked.find_all('th')]
    dfList = [pd.DataFrame(columns=headers_link if headers_link else primaryDf.columns)]
else:
    primaryDf, dfList = None, []

count = 0
for link in links:
    count += 1
    url_linked = url_l + link
    page_linked = requests.get(url_linked)
    soup_linked = BeautifulSoup(page_linked.text, 'lxml')
    table_linked = soup_linked.find('table', class_="table table-directory-responsive")
    if not table_linked:
        ## to see if any response errors or redirects
        print(f'[{page_linked.status_code} {page_linked.reason} from {page_linked.url}]')
        ## print error message and move to next link
        print(f'Found no tables with required class at link#{count}', url_linked)
        continue
    tempDf = pd.read_html(table_linked.prettify())[0]  ## read table as df [if found]
    ## get rid of empty rows and empty columns
    tempDf = tempDf.dropna(axis='index', how='all').dropna(axis='columns', how='all')
    dfList.append(tempDf.loc[:])  ## .loc[:] to append a copy, not the original (just in case)
    print(f'Completed link#{count} with {len(tempDf)} rows from {url_linked}')

combinedDF = pd.concat(dfList)

Update plotly chart based on different button click (load different data)

I am trying to create a plotly chart with some subplots based on Use button to filter different data in plotly python.
The chart generation function takes as input a stock symbol, a dict of periods (m1, m3, m5... for different minutes), and stock-specific period DataFrames.
I am trying to put the periods on buttons, so that on a period button click the corresponding DataFrame (OHLC) is loaded, along with the period-dependent indicators - MACD, RSI and ADX.
The issue is that only the last period df is loaded, and the buttons are not showing/loading the period-specific OHLCV.
Below is the function
def plot_plotly_v3(in_stock, in_period_stock_mdf_df_dict, n):
    f_stock = in_stock
    f_period_stock_mdf_df_dict = in_period_stock_mdf_df_dict
    period_buttons = []
    i = 0
    period_length = len(f_period_stock_mdf_df_dict)  # to calculate visible args
    period_frequency_dict = config.c_g_period_python_freq_dict  # broker period to python period
    for period, stock_period_df in f_period_stock_mdf_df_dict.items():
        stock_period_df.index = stock_period_df.index.droplevel([1, 2])
        fig = make_subplots(rows=4, cols=1, shared_xaxes=True, vertical_spacing=0.007,
                            row_heights=[.35, .20, .20, .25],
                            subplot_titles=('', 'MACD', 'RSI', 'ADX'))
        # removing all empty dates and building a complete timeline from start date to end date
        py_frequency = period_frequency_dict.get(period)  # broker period to python period mapping
        dt_all = pd.date_range(start=stock_period_df.index[0], end=stock_period_df.index[-1], freq=py_frequency)
        # retrieve the dates that ARE in the original dataset
        dt_obs = [d.strftime("%Y-%m-%d %H:%M:%S") for d in pd.to_datetime(stock_period_df.index)]
        # define dates with missing values
        dt_breaks = [d for d in dt_all.strftime("%Y-%m-%d %H:%M:%S").tolist() if d not in dt_obs]
        in_period_int = int(config.g_period_2_period_int_dict.get(period))
        dvalue_ms = in_period_int * 60 * 1000
        fig.update_xaxes(rangebreaks=[dict(values=dt_breaks, dvalue=dvalue_ms)])
        fig_title = (in_stock + ' for period ' + period + ' for range ' + 'From: ' +
                     str(stock_period_df.index[0]) + ' To: ' + str(stock_period_df.index[-1]))
        for annotation in fig['layout']['annotations']:
            annotation['textangle'] = -90
        fig.update_annotations(xshift=-620, yshift=-100)
        # Plot OHLC and MAs on 1st subplot
        # Plot MACD trace on 2nd row
        # Plot RSI trace on 3rd row
        # Plot ADX trace on 4th row
        # create visible args - True/False list depending upon period/df dict
        visible_args = create_true_false_list_v0(i, period_length)
        # create a button object for the period we are on
        button = dict(label=period, method="update", args=[{"visible": visible_args}])
        # add the button to our list of buttons
        period_buttons.append(button)
        # i is an iterable used to tell our "args" list which value to set to True
        i += 1
    fig.update_layout(updatemenus=[dict(type="buttons", direction="right", x=1, y=1, buttons=period_buttons)],
                      height=800, width=1350, title=fig_title, xaxis_rangeslider_visible=False)
    fig.show()
Would appreciate any support/guidance.
Questions:
1] Is it possible to display the title at the same level as the toolbar/modebar?
2] Is it possible to display the period buttons at the same level as the toolbar?
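For context, create_true_false_list_v0 is referenced but never shown in the question. A hypothetical sketch of what such a visibility-mask helper usually looks like in this button pattern, assuming one trace per period for simplicity:

def create_true_false_list_v0(index, total):
    # hypothetical helper: one visibility flag per period's trace,
    # True only for the period whose button was clicked
    return [i == index for i in range(total)]

print(create_true_false_list_v0(1, 4))  # [False, True, False, False]

With several traces per period (OHLC, MACD, RSI, ADX), the mask would need one entry per trace rather than one per period.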

Concatenating tables with axis=1 in Orange python

I'm fairly new to Orange.
I'm trying to separate rows of angle (elv) into intervals.
Let's say I want to separate my 90-degree angle into 8 intervals, i.e. 90/8 = 11.25 degrees per interval.
Here's the table I'm working with.
Here's what I did originally, separating them by their elv value.
Here's the result that I want: x rows and 16 columns, separated by their elv value.
But I want this done dynamically.
I list them out and turn each list into a table with x rows and 2 columns.
This is what I originally did:
from Orange.data.table import Table
from Orange.data import Domain, ContinuousVariable, DiscreteVariable
import numpy
import pandas as pd
from pandas import DataFrame

df = pd.DataFrame()
num = 10  # number of intervals that we want to separate our elv into
interval = 90.00 / num  # separating them into degrees per interval
low = 0
high = interval
table = []
first = []
second = []
for i in range(num):
    between = []
    if i != 0:  # not the first run
        low = high
        high = high + interval
    for row in in_data:  # run through the whole table to see if the elv falls within the interval
        if row[0] >= low and row[0] < high:
            between.append(row)
    elv = "elv" + str(i)
    err = "err" + str(i)
    domain = Domain([ContinuousVariable.make(err)], [ContinuousVariable.make(elv)])
    data = Table.from_numpy(domain, numpy.array(between))
    print("table number ", i)
    print(data[:3])
Here's the output.
But as you can see, these are separate tables being assigned on every loop,
and I have to find a way to concatenate these tables with axis = 1.
Even the source code for Orange3 seems to forbid this for some reason.
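Since the script already imports pandas, one possible workaround (my suggestion, not Orange's own API) is to frame each interval's two columns and concatenate them side by side with pandas before converting back to an Orange Table:

import numpy as np
import pandas as pd

# hypothetical stand-ins for the per-interval `between` arrays collected in the loop,
# each one an (n_i x 2) array of [err_i, elv_i] values
between_arrays = [np.random.rand(5, 2), np.random.rand(3, 2)]

frames = [pd.DataFrame(arr, columns=["err" + str(i), "elv" + str(i)])
          for i, arr in enumerate(between_arrays)]
wide = pd.concat(frames, axis=1)  # x rows, 2 columns per interval, NaN-padded
print(wide.shape)

The resulting wide.to_numpy() could then be handed to Table.from_numpy, with a Domain built from the combined column names, as in the loop above.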

Python 3 openpyxl finding all of string in column

I am just starting to learn Python and am looking for some direction on a script I am working on to text out daily pick-ups for my drivers. The vendor name is entered into a spreadsheet along with a purchase order # and notes. What I would like to do is cycle through column "A", find all instances of a vendor name, grab the corresponding B and C cell values, and save all the info to a text file. I can get it to work if I name the search string explicitly, but not if it's a variable. Here is what I have so far:
TestList = []
TestDict = {}
LineNumber = 0
for i in range(1, maxrow + 1):
    VendorName = sheet.cell(row=i, column=1)
    if VendorName.value == "CERTIFIED LETTERING":  # here is where I'm lost
        #print(VendorName.coordinate)
        VendLoc = str(VendorName.coordinate)
        TestList.append(VendLoc)
        TestDict[VendorName.value] = [TestList]
test = TestDict["CERTIFIED LETTERING"][0]
ListLength = len(test)
ListPo = []
List_Notes = []
number = 0
for i in range(0, ListLength):
    PO = str('B' + test[number][1])
    Note = str('C' + test[number][1])
    ListPo.append(PO)
    List_Notes.append(Note)
    number = number + 1
number = 0
TestVend = str(VendorName.value)
sonnetFile = open('testsaveforpickups.txt', 'w')
sonnetFile.write("Pick up at:" + '\n')
sonnetFile.write(str(VendorName.value) + '\n')
for i in range(0, ListLength):
    sonnetFile.write("PO# " + str(sheet[ListPo[number]].value) + '\n'
                     + "NOTES: " + str(sheet[List_Notes[number]].value) + '\n')
    number = number + 1
sonnetFile.close()
the results are as follows:
Pick up at:
CERTIFIED LETTERING
PO# 1111111-00
NOTES: aaa
PO# 333333-00
NOTES: ccc
PO# 555555-00
NOTES: eee
I've tried everything I could think of to change the current string "CERTIFIED LETTERING" to a variable name, including creating a list of all vendors in column A and using that as a dictionary to go off of. Any help or ideas to point me in the right direction would be appreciated. And I apologise for any formatting errors; I'm new to posting here.
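One hedged way to make the vendor a variable (a sketch, assuming the same sheet and maxrow as above, with vendor/PO/notes in columns A/B/C) is to group every row by column A first, then write out whichever vendor is needed:

from collections import defaultdict

pickups = defaultdict(list)  # vendor name -> list of (PO, note) pairs
for row in sheet.iter_rows(min_row=1, max_row=maxrow, max_col=3):
    vendor, po, note = (c.value for c in row)
    if vendor:
        pickups[vendor].append((po, note))

vendor_name = "CERTIFIED LETTERING"  # any variable works here now
with open('testsaveforpickups.txt', 'w') as f:
    f.write("Pick up at:\n" + str(vendor_name) + "\n")
    for po, note in pickups[vendor_name]:
        f.write("PO# " + str(po) + "\nNOTES: " + str(note) + "\n")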

Odd Column returns when scraping with lxml

I am learning Python and trying to build a scraper to glean parts data from a supplier's site. My issue now is that I am getting different column counts from my parsed table rows, where I know that every row has the same column count. The issue has to be something I am overlooking, and after two days of trying different things I am asking for a few more sets of eyes on my code to locate my error. Not having much Python coding experience is no doubt my biggest hurdle.
First, the data. Rather than paste the html I have stored in my database, I'll give you a link to the live site I have crawled and stored in my db. The first link is this one.
The issue is that I get mostly correct results. However, every so often I get the values skewed in the column count. I can't seem to locate the cause.
Here is an example of the flawed result:
----------------------------------------------------------------------------------
Record: 1 Section:Passenger / Light Truck Make: ACURA SubMake:
Model: CL SubModel: Year: 1997 Engine: L4 1.6L 1590cc
----------------------------------------------------------------------------------
Rec:1 Row 6 Col 1 part Air Filter
Rec:1 Row 6 Col 2 2
Rec:1 Row 6 Col 3 part_no 46395
Rec:1 Row 6 Col 4 filter_loc
Rec:1 Row 6 Col 5 engine
Rec:1 Row 6 Col 6 vin_code V6 3.0L 2997cc
Rec:1 Row 6 Col 7 comment Engine Code J30A1
** Note that the engine value has been shifted to the vin_code field.
And proof it works some of the time:
Record: 2 Section:Passenger / Light Truck Make: ACURA SubMake:
Model: CL SubModel: Year: 1998 Engine: L4 1.6L 1590cc
----------------------------------------------------------------------------------
Rec:3 Row 4 Col 1 part Oil Filter
Rec:3 Row 4 Col 2 2
Rec:3 Row 4 Col 3 part_no 51334
Rec:3 Row 4 Col 4 filter_loc
Rec:3 Row 4 Col 5 engine L4 2.3L 2254cc
Rec:3 Row 4 Col 6 vin_code
Rec:3 Row 4 Col 7 comment Engine Code F23A1
** Note the fields line up in this record...
I suspect either there is something in the table cells my parser is not looking for or I have missed something trivial.
Here is the important portion of my code:
# Per Query
while records:
    # Per Query Loop
    #print str(records)
    for record in records:
        print 'Record Count:'+str(rec_cnt)
        items = ()
        item = {}
        source = record['doc']
        page = html.fromstring(source)
        for rows in page.xpath('//div/table'):
            #records = []
            item = {}
            cntx = 0
            for row in list(rows):
                cnty = 0       # Column Counter
                found_oil = 0  # Found oil filter record flag
                data = {}      # Data
                # Data fields
                field_data = {'part':'', 'part_no':'', 'filter_loc':'', 'engine':'', 'vin_code':'', 'comment':'', 'year':''}
                print
                print '----------------------------------------------------------------------------------'
                print 'Record: '+str(record['id']), 'Section:'+str(record['section']), 'Make: '+str(record['make']), 'SubMake: '+str(record['submake'])
                print 'Model: '+str(record['model']), 'SubModel: '+str(record['submodel']), 'Year: '+str(record['year']), 'Engine: '+str(record['engine'])
                print '----------------------------------------------------------------------------------'
                #
                # Rules for extracting data columns
                # 1. First column always has a link to the bullet image
                # 2. Second column is part name
                # 3. Third column always empty
                # 4. Fourth column is part number
                # 5. Fifth column is empty
                # 6. Sixth column is part location
                # 7. Seventh column is always empty
                # 8. Eighth column is engine size
                # 9. Ninth column is vin code
                # 10. Tenth column is Comment
                # 11. Eleventh column does not exist.
                #
                for column in row.xpath('./td[@class="blackmedium"][text()="0xa0"] | ./td[@class="blackmedium"][text()="\n"]/text() | ./td[@class="blackmeduim"]/img[@src]/text() | ./td[@class="blackmedium"][text()=""]/text() | ./td[@class="blackmedium"]/b/text() | ./td[@class="blackmedium"]/a/text() | ./td[@class="blackmedium"]/text() | ./td[@class="blackmedium"][text()=" "]/text() | ./td[@class="blackmedium"][text()="&#160"]/text() | ./td[@class="blackmedium"][text()=None]/text()'):
                    #' | ./td[position()>1]/a/text() | ./td[position()>1]/text() | self::node()[position()=1]/td/text()'):
                    cnty += 1
                    if ('Oil Filter' == column.strip() or 'Air Filter' == column.strip()) and found_oil == 0:
                        found_oil = 1
                    if found_oil == 1:
                        print 'Rec:'+str(rec_cnt), 'Row '+str(cntx), 'Col '+str(cnty), _fields[cnty], column.strip()
                        #cnty += 1
                        #print
                    else:
                        print 'Rec: '+str(rec_cnt), 'Col: '+str(cnty)
                        field_data[ str(_fields[cnty]) ] = str(column.strip())
                        #cnty = cnty+1
                # Save data to db dest table
                if found_oil == 1:
                    data['source_id'] = record['id']
                    data['section_id'] = record['section_id']
                    data['section'] = record['section']
                    data['make_id'] = record['make_id']
                    data['make'] = record['make']
                    data['submake_id'] = record['submake_id']
                    data['submake'] = record['submake']
                    data['model_id'] = record['model_id']
                    data['model'] = record['model']
                    data['submodel_id'] = record['submodel_id']
                    data['submodel'] = record['submodel']
                    data['year_id'] = record['year_id']
                    data['year'] = record['year']
                    data['engine_id'] = record['engine_id']
                    data['engine'] = record['engine']
                    data['part'] = field_data['part']
                    data['part_no'] = field_data['part_no']
                    data['filter_loc'] = field_data['filter_loc']
                    data['vin_code'] = field_data['vin_code']
                    data['comment'] = conn.escape_string(field_data['comment'])
                    data['url'] = record['url']
                    save_data(data)
                print 'Field Data:'
                print field_data
                cntx += 1
        rec_cnt += 1
    #End main per query loop
    delay()  # delay if wait was passed on cmd line
    records = get_data()
    has_offset = 1
#End Queries
Thank you all for your help and your eyes...
Usually when I run into a problem like this, I do two things:
Break the problem down into smaller chunks. Use python functions or classes to perform subsets of functionality so that you can test the functions individually for correctness.
Use the Python Debugger to inspect the code as it runs to understand where it's failing. For example, in this case, I would add import pdb; pdb.set_trace() before the line that says cnty+=1.
Then, when the code runs, you'll get an interactive interpreter at that point and you can inspect the various variables and discover why you're not getting what you expect.
A couple of tips for using pdb:
Use c to allow the program to continue (until the next breakpoint or set_trace); Use n to step to the next line in the program. Use q to raise an Exception (and usually abort).
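A minimal, self-contained illustration of that set_trace workflow (stand-in data rather than the scraper itself):

import pdb

columns = ['Air Filter', '', '46395']  # stand-in for one parsed row
cnty = 0
for column in columns:
    pdb.set_trace()  # pauses each iteration; try printing cnty and column, then c to continue
    cnty += 1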
Can you share the details of your scraping process? The intermittent failures could be related to the parsing of the html data.
The problem seems to be that your xpath expression searches for text nodes. No matches are found for empty cells, causing your code to "skip" columns. Try iterating over the td elements themselves, and then "look down" from the element to its contents. To get you started:
# just iterate over child elements of the row, which are always td
# use enumerate to easily get a counter for the columns
for col_no, td in enumerate(row, start=1):
    # use the xpath function string() to get the string value for the element
    # this will yield an empty string for empty elements
    print col_no, td.xpath('string()')
Note that the use of the string() xpath function may in some cases be not enough/too simple for what you want. In your example, you may find something like <td><a>51334</a><sup>53</sup></td> (see the oil filter). My example would give you "5133453", where you would seem to need "51334" (not sure if that was intentional or if you hadn't noticed the "missing" part). If you want only the text inside the hyperlink, use td.findtext('a').
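To make the difference concrete, a small sketch with a hand-built fragment (lxml.etree is used here only to construct the example td):

from lxml import etree

td = etree.fromstring('<td><a>51334</a><sup>53</sup></td>')
print(td.xpath('string()'))  # '5133453' - all descendant text concatenated
print(td.findtext('a'))      # '51334'  - only the hyperlink text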
I want to thank everyone who has given aid to me these past few days. All your input has resulted in a working application that I am now using. I wanted to post the resulting changes to my code so those who look here may find an answer or at least information on how they may also tackle their issue. Below is the rewritten portion of my code that solved the issues I was having:
#
# get_column_index()
# returns a dict of column name/column number pairs
#
def get_column_index(row):
    index = {}
    col_no = 0
    td = None
    name = ''
    for col_no, td in enumerate(row, start=0):
        mystr = str(td.xpath('string()').encode('ascii', 'replace'))
        name = str.lower(mystr).replace(' ', '_')
        idx = name.replace('.', '')
        index[idx] = col_no
    if int(options.verbose) > 2:
        print 'Field Index:', str(index)
    return index

def run():
    global has_offset
    records = get_data()
    #print 'Records', records
    rec_cnt = 0
    # Per Query
    while records:
        # Per Query Loop
        #print str(records)
        for record in records:
            if int(options.verbose) > 0:
                print 'Record Count:'+str(rec_cnt)
            items = ()
            item = {}
            source = record['doc']
            page = html.fromstring(source)
            col_index = {}
            for rows in page.xpath('//div/table'):
                #records = []
                item = {}
                cntx = 0
                for row in list(rows):
                    data = {}      # Data
                    found_oil = 0  # found proper part flag
                    # Data fields
                    field_data = {'part':'', 'part_no':'', 'part_note':'', 'filter_loc':'', 'engine':'', 'vin_code':'', 'comment':'', 'year':''}
                    if int(options.verbose) > 0:
                        print
                        print '----------------------------------------------------------------------------------'
                        print 'Row'+str(cntx), 'Record: '+str(record['id']), 'Section:'+str(record['section']), 'Make: '+str(record['make']), 'SubMake: '+str(record['submake'])
                        print 'Model: '+str(record['model']), 'SubModel: '+str(record['submodel']), 'Year: '+str(record['year']), 'Engine: '+str(record['engine'])
                        print '----------------------------------------------------------------------------------'
                    # get column indexes
                    if cntx == 1:
                        col_index = get_column_index(row)
                    if col_index != None and cntx > 1:
                        found_oil = 0
                        for col_no, td in enumerate(row):
                            if ('part' in col_index) and (col_no == col_index['part']):
                                part = td.xpath('string()').strip()
                                if 'Oil Filter' == part or 'Air Filter' == part or 'Fuel Filter' == part or 'Transmission Filter' == part:
                                    found_oil = 1
                                field_data['part'] = td.xpath('string()').strip()
                            # Part Number
                            if ('part_no' in col_index) and (col_no == col_index['part_no']):
                                field_data['part_no'] = str(td.xpath('./a/text()')).strip().replace('[', '').replace(']', '').replace("'", '')
                                field_data['part_note'] = str(td.xpath('./sup/text()')).strip().replace('[', '').replace(']', '').replace("'", '')
                            # Filter Location
                            if ('filterloc' in col_index) and (col_no == col_index['filterloc']):
                                field_data['filter_loc'] = td.xpath('string()').strip()
                            # Engine
                            if ('engine' in col_index) and (col_no == col_index['engine']):
                                field_data['engine'] = td.xpath('string()').strip()
                            if ('vin_code' in col_index) and (col_no == col_index['vin_code']):
                                field_data['vin_code'] = td.xpath('string()').strip()
                            if ('comment' in col_index) and (col_no == col_index['comment']):
                                field_data['comment'] = td.xpath('string()').strip()
                                if int(options.verbose) == 0:
                                    print ','
                        if int(options.verbose) > 0:
                            print 'Field Data: ', str(field_data)
                        elif int(options.verbose) == 0:
                            print '.'
                        # Save data to db dest table
                        if found_oil == 1:
                            data['source_id'] = record['id']
                            data['section_id'] = record['section_id']
                            data['section'] = record['section']
                            data['make_id'] = record['make_id']
                            data['make'] = record['make']
                            data['submake_id'] = record['submake_id']
                            data['submake'] = record['submake']
                            data['model_id'] = record['model_id']
                            data['model'] = record['model']
                            data['submodel_id'] = record['submodel_id']
                            data['submodel'] = record['submodel']
                            data['year_id'] = record['year_id']
                            data['year'] = record['year']
                            data['engine_id'] = record['engine_id']
                            data['engine'] = field_data['engine']  #record['engine']
                            data['part'] = field_data['part']
                            data['part_no'] = field_data['part_no']
                            data['part_note'] = field_data['part_note']
                            data['filter_loc'] = field_data['filter_loc']
                            data['vin_code'] = field_data['vin_code']
                            data['comment'] = conn.escape_string(field_data['comment'])
                            data['url'] = record['url']
                            save_data(data)
                            found_oil = 0
                            if int(options.verbose) > 2:
                                print 'Data:', str(data)
                    cntx += 1
            rec_cnt += 1
        #End main per query loop
        delay()  # delay if wait was passed on cmd line
        records = get_data()
        has_offset = 1
    #End Queries
