I was scraping this website (https://www.ivolatility.com/options/RVX/) using the Python requests module. The output from selecting the first table with BeautifulSoup is shown below. Now, inside this first table, I am trying to get a specific value (19.17) from the soup built from the requests response.
I would like to achieve this using BeautifulSoup, but I don't know how to select the specific cell where the value is stored.
Do any of you have any suggestions?
Output from requests:
<table border="0" bordercolor="red" cellpadding="0" cellspacing="0" width="100%">
<tr>
<td colspan="3"><script language="JavaScript">
// Submit handler for the symbol form: cancels the default submission and
// navigates to a URL built from the ticker and region controls.
function submitCalcForm(event) {
event.preventDefault();
var form = document.getElementById('basicOptionsForm');
// NOTE(review): `action` is assigned here but never read again in this function.
var action = form.action;
// Region path segments, looked up by the value of the selected <option>.
var regions = ['', 'USA', 'Europe', 'Asia', 'Canada'];
// form[1] is the form's second control (the region <select> in the markup below).
var regionsOptions = form[1];
var selectedRegion = regionsOptions.options[regionsOptions.selectedIndex].value;
// form[0] is the form's first control (the ticker text input).
var symbol = form[0].value.trim();
// When the current URL contains '.j', build the query-string form of the address;
// otherwise build the path form /options/<symbol or '-'>/<region>.
var location = (window.location.href.indexOf('.j')>-1)
? (form.action + '?' + form[0].name + '=' + form[0].value + '&' + form[1].name + '=' + selectedRegion)
: ('/options/'+ ((symbol == '') ? '-' : symbol ) +'/'+regions[selectedRegion]);
window.location.href= location;
}
// Navigates to "/options/-/"; wired to the "Symbol Lookup" button's onclick.
function goToLookup() {
window.location.href= "/options/-/";
}
</script>
<form action="/options.j" id="basicOptionsForm" method="get" onsubmit="submitCalcForm(event);">
<table bgcolor="#ffffff" border="0" cellpadding="0" cellspacing="0">
<tr>
<td>
<table bgcolor="#999999" border="0" cellpadding="0" cellspacing="1">
<tr>
<td bgcolor="#567abb">
<table border="0" cellpadding="1" cellspacing="0" class="table-action">
<tr>
<td><span class="s1w" style="color: #fff;"> Symbol: </span></td><td><input class="s2" name="ticker" size="5" type="text" value="RVX"/></td><td><select class="s2" name="R"><option selected="" value="0">
ALL
</option><option value="1">
USA
</option><option value="2">
Europe
</option><option value="4">
Canada
</option></select></td><td><span class="s2"> </span></td><td><button style="background: #0C6EF8; font-weight: bold; border: 1px solid black;" type="submit">GO!</button></td><td><span class="s2"> </span></td><td><button onclick="goToLookup();" style="background: #0C6EF8; font-weight: bold; border: 1px solid black; color: white; white-space: nowrap;" type="button">
Symbol Lookup</button></td><td><span class="s2"> </span></td>
</tr>
</table>
</td>
</tr>
</table>
</td><td><img border="0" height="1" src="/design/images/0.gif" width="5"/></td><td nowrap=""><b><span class="s4">Russell 2000 Volatility Index</span></b></td><td width="100%"> </td>
</tr>
</table>
</form>
</td>
</tr>
<tr>
<td colspan="3"><img alt="." border="0" height="10" src="/design/images/0.gif" width="1"/></td>
</tr>
<tr valign="top">
<td width="100%"><script type="text/javascript">
<!--
// Shorthand for document.write: emits the markup string `s` into the page while
// it loads. The quote table below is produced entirely through this helper, so
// it is not present as static HTML.
function wr(s) {
document.write(s);
}
var d = new Array(10);
d[20]='N/A';d[25]='-94.06%';d[30]='32.03%';d[35]='34.74';d[56]='N/A';d[61]='N/A';d[66]='10-Apr';d[71]='84.49%';d[97]='N/A';d[102]='03-Oct';d[107]='29-Mar';d[112]='1.43';d[133]='N/A';d[138]='N/A';d[143]='148.97%';d[148]='98.46%';d[174]='N/A';d[179]='-46.88%';d[184]='198.21%';d[189]='0.27';d[210]='N/A';d[215]='N/A';d[220]='25-May';d[225]='110.30%';d[251]='N/A';d[256]='-68.76%';d[261]='75.38%';d[266]='0';d[287]='N/A';d[292]='N/A';d[297]='39.85%';d[302]='120.02%';d[328]='N/A';d[333]='-67.09%';d[338]='69.94%';d[343]='19.17';d[364]='N/A';d[369]='N/A';d[374]='06-Apr';d[379]='06/14/2018';d[405]='N/A';d[410]='-82.49%';d[415]='74.41%';d[441]='N/A';d[446]='N/A';d[451]='164.16%';d[456]='12.93';d[482]='N/A';d[487]='24-May';d[492]='77.70%';d[518]='N/A';d[523]='03-May';d[528]='21-May';d[533]='12/24/2018';d[559]='N/A';d[564]='59.42%';d[569]='84.78%';
wr('<table class="table-data" cellpadding=1 cellspacing=1 border=0 width=100%>');
wr('<tr bgcolor="#cccccc" align=right height=20>');
wr('<td align="center"><font class=s1>Price</font></td>');
wr('<td align="center"><font class=s1>Change (%)</font><img src="/design/images/0.gif" width=4 height=1 border=0/></td>');
wr('<td align="center"><font class=s1>52 wk High</font><img src="/design/images/0.gif" width=4 height=1 border=0/></td>');
wr('<td align="center"><font class=s1>52 wk Low</font><img src="/design/images/0.gif" width=4 height=1 border=0/></td>');
wr('<td align="center"><font class=s1>Stock volume</font>');
wr('<a href="javascript:openHelp(14)" alt="Open Help">');
wr('<img src="/design/images/ico/q_zn.gif" width=8 height=10 border=0 alt="Open Help"/>');
wr('</a><img src="/design/images/0.gif" width=4 height=1 border=0/></td>');
wr('</tr>');
wr('<tr bgcolor="#FFFFFF" align=right height=20>');
wr('<td align="center"><font class=s1>');
wr(d[343]);
wr('</font></td>');
wr('<td align="center"><font class=s1><nobr> ');
wr('<img src="/design/images/ico/up.gif" alt="+" border=0 align="absmiddle" width=7 height=9/> +');
wr(d[189]);
wr(' (+');
wr(d[112]);
wr('%)</nobr></font></td>');
wr('<td align="center"><font class=s1><nobr> ');
wr(d[35]);
wr(' ');
wr(d[533]);
wr('</nobr></font></td><td align="center"><font class=s1><nobr> ');
wr(d[456]);
wr(' ');
wr(d[379]);
wr('</nobr></font></td>');
wr('<td align="center"><font size=-2 class=s1>');
wr(d[266]);
wr('</font></td>');
wr('</tr></table>');
//-->
</script><img border="0" height="10" src="/design/images/0.gif" width="1"/><table border="0" cellpadding="0" cellspacing="0" class="table-data" width="100%">
<tr align="center" bgcolor="
#cccccc
" height="20">
<td align="center" colspan="2"><font class="s2">Current</font></td><td><font class="s2">1 WK AGO</font></td><td><font class="s2">1 MO AGO</font></td><td><font class="s2">52 wk Hi/Date</font></td><td><font class="s2">52 wk Low/Date</font></td>
</tr>
<tr>
<td align="center" bgcolor="
#FFFFFF
" colspan="5" height="20"><font class="s2" color=""> HISTORICAL VOLATILITY <a alt="Open Help" href="javascript:openHelp(4)"><img alt="Open Help" border="0" height="10" src="/design/images/ico/q_zn.gif" width="8"/></a></font></td>
</tr>
<tr align="center" bgcolor="#ffffff">
<td align="right"><font class="s2">10 days</font></td><td><font class="s2">120.02%</font></td><td><font class="s2">84.49%</font></td><td><font class="s2">74.41%</font></td><td><font class="s2">198.21% - 29-Mar</font></td><td><font class="s2">32.03% - 21-May</font></td>
</tr>
<tr align="center" bgcolor="#eeeeee">
<td align="right"><font class="s2">20 days</font></td><td><font class="s2">110.30%</font></td><td><font class="s2">84.78%</font></td><td><font class="s2">69.94%</font></td><td><font class="s2">164.16% - 06-Apr</font></td><td><font class="s2">39.85% - 25-May</font></td>
</tr>
<tr align="center" bgcolor="#ffffff">
<td align="right"><font class="s2">30 days</font></td><td><font class="s2">98.46%</font></td><td><font class="s2">77.70%</font></td><td><font class="s2">75.38%</font></td><td><font class="s2">148.97% - 10-Apr</font></td><td><font class="s2">59.42% - 24-May</font></td>
</tr>
<tr>
<td align="center" bgcolor="
#FFFFFF
" colspan="5" height="20"><font class="s2" color=""> IMPLIED VOLATILITY <img alt="Open Help" border="0" height="10" src="/design/images/ico/q_zn.gif" width="8"/></font></td>
</tr>
<tr align="center" bgcolor="#ffffff">
<td align="right"><font class="s2">IV Index call <img alt="Open Help" border="0" height="10" src="/design/images/ico/q_zn.gif" width="8"/></font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A - N/A</font></td><td><font class="s2">N/A - N/A</font></td>
</tr>
<tr align="center" bgcolor="#eeeeee">
<td align="right"><font class="s2">IV Index put <img alt="Open Help" border="0" height="10" src="/design/images/ico/q_zn.gif" width="8"/></font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A - N/A</font></td><td><font class="s2">N/A - N/A</font></td>
</tr>
<tr align="center" bgcolor="#ffffff">
<td align="right"><font class="s2">IV Index mean <img alt="Open Help" border="0" height="10" src="/design/images/ico/q_zn.gif" width="8"/></font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A - N/A</font></td><td><font class="s2">N/A - N/A</font></td>
</tr>
<tr>
<td align="center" bgcolor="
#FFFFFF
" colspan="5" height="20"><font class="s2" color="">HISTORICAL 30-DAYS CORRELATION AGAINST S&P 500 Index (SPX)<img alt="Open Help" border="0" height="10" src="/design/images/ico/q_zn.gif" width="8"/></font></td>
</tr>
<tr align="center" bgcolor="#ffffff">
<td align="right"><font class="s2">30 days</font></td><td><font class="s2">-82.49%</font></td><td><font class="s2">-67.09%</font></td><td><font class="s2">-68.76%</font></td><td><font class="s2">-46.88% - 03-Oct</font></td><td><font class="s2">-94.06% - 03-May</font></td>
</tr>
</table>
</td>
</tr>
</table>
The page is dynamic so you'd need to render the page first with something like Selenium.
Also, you can use BeautifulSoup, or even Selenium, to parse the HTML once you have it. But I noticed that the value is located within <table> tags. Whenever I see a <table> tag, I usually opt to go with pandas' .read_html(), as it'll do the hard work for you.
.read_html() will return a list of dataframes; then it's just a matter of finding the data you want, or manipulating the table as needed. The data you want was found in the dataframe at index position 4 (it was also in position 0, but I chose to go with 4 since it was right there: 2nd row, first column). Then just slice that dataframe to get that specific cell:
from io import StringIO

import pandas as pd
from selenium import webdriver

# The quote table is written by JavaScript (document.write), so the page has
# to be rendered in a real browser before its HTML can be parsed.
url = 'https://www.ivolatility.com/options/RVX/'
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
try:
    driver.get(url)
    # read_html returns one DataFrame per <table>. The markup is wrapped in
    # StringIO because newer pandas releases deprecate literal HTML strings.
    tables = pd.read_html(StringIO(driver.page_source))
finally:
    # quit() ends the whole driver session (close() only closes the current
    # window), and the finally-block runs even when loading or parsing fails.
    driver.quit()

# Table 4, column 0, row 1 is the cell that holds the price.
price = tables[4][0][1]
Output:
print (price)
19.17
Related
I am trying to generate a PDF using xhtml2pdf in Django for a language other than English, but it shows black square boxes and unusual text.
I am trying to render Nepali text in my PDF but I am getting this issue. Can anyone help me solve it?
import imp
from io import StringIO
# from weasyprint import HTML
from django.template.loader import render_to_string
import tempfile
from django.http import HttpResponse
from django.conf import settings
import datetime
# A stream implementation using an in-memory bytes buffer
# It inherits BufferIOBase
from django.http import HttpResponse
from django.template.loader import get_template
from xhtml2pdf import pisa # a html2pdf converter
def render_to_pdf(template_src, context_dict=None):
    """
    Render a Django template to PDF and return it as an HTTP response.

    @param template_src: name of the template file to be converted
    @param context_dict: dictionary with the data written into the template.
        Its "invoice_title" entry (default "invoice") is used as the base of
        the download file name. The caller's dictionary is not modified.
    @return: an "application/pdf" HttpResponse containing the document, or an
        HTML HttpResponse showing the rendered markup if pisa reports errors.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from django.utils.html import escape

    # Work on a copy so neither the caller's dict nor a shared default is mutated.
    context = dict(context_dict or {})

    # Build the timestamped file name exactly once; the template sees the same
    # value under "invoice_title" that the browser gets as the file name.
    filename = (
        str(context.get("invoice_title", "invoice"))
        + str(datetime.datetime.now())
        + ".pdf"
    )
    context["invoice_title"] = filename

    # this will render the html template and parse the data into the template
    html = get_template(template_src).render(context)

    response = HttpResponse(content_type="application/pdf")
    # A disposition takes a single type; the file name is a quoted-string, so
    # backslashes and double quotes inside it are escaped.
    quoted = filename.replace("\\", "\\\\").replace('"', '\\"')
    response["Content-Disposition"] = 'inline; filename="' + quoted + '"'

    # NOTE(review): per the xhtml2pdf docs, non-Latin text (e.g. Devanagari)
    # only renders when the template declares a covering font via @font-face;
    # otherwise the missing glyphs come out as black boxes — confirm the
    # template does so.
    pisa_status = pisa.pisaDocument(
        html.encode("UTF-8"), response, encoding="UTF-8"
    )
    if pisa_status.err:
        # Escape the markup so it is displayed as text rather than rendered.
        return HttpResponse("We had some errors <pre>" + escape(html) + "</pre>")
    return response
html template:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Purchase Invoice</title>
<style>
/* NOTE(review): per the xhtml2pdf docs, text in non-Latin scripts needs a
   covering TrueType font declared with @font-face and selected through
   font-family; no such font is declared in this stylesheet. */
.heading {
font-size: 12px;
font-weight: bold;
text-align: center;
padding-top: 50px;
}
.subheading {
font-size: 12px;
text-align: center;
}
.col-12 {
width: 100%;
float: left;
}
.col-3 {
width: 25%;
float: left;
}
.col-6 {
width: 50%;
float: left;
}
.textalignright {
text-align: right;
font-size: 12px;
}
.detail {
font-size: 12px;
/* font-weight: bold; */
}
table,
th,
td {
border: 1px solid black;
border-collapse: collapse;
padding: 3px 3px 0px 3px;
}
.w-5 {
width: 10%;
}
.w-25 {
width: 50%;
}
</style>
</head>
<body>
<div class="container">
<div class="Company_profile">
<div class="row">
<div class="col-12 heading">
<h1 class="text-white">{{request.user.company.print_name}}</h1>
</div>
</div>
<div class="row">
<table style="width:100%; border:none;">
<tr style="border:none;">
<td style="width:33.33%; border:none;"></td>
<td style="width:33.33%; border:none; text-align:center;">Email : {{request.user.company.email}}
</td>
<td style="width:33.33%; border:none; text-align:right;">Times Printed :1</td>
</tr>
<tr style="border:none;">
<td style="width:33.33%; border:none;"></td>
<td style="width:33.33%; border:none; text-align:center;">VAT Registration No: {{request.user.company.it_pan}}</td>
<td style="width:33.33%; border:none; text-align:right;">Copy of Original</td>
</tr>
</table>
</div>
<div class="row">
<div class="col-12 subheading">
<h3 class="float-left">कर बिजक</h3>
</div>
</div>
</div>
<div class="Customer_info">
<div class="row">
<table style="width:100%; border:none;">
<tr style="border:none;">
<td style="width:50%; border:none;" class="detail">Invoice No.:{{purchase_data.voucherno}}</td>
<td style="width:50%; border:none; text-align:right;" class="detail">Invoice Date :{{purchase_data.formatted_nepalidate}}</td>
</tr>
<tr style="border:none;">
<td style="width:50%; border:none;" class="detail">Customer Name:{{purchase_data.party.name}}</td>
<td style="width:50%; border:none;"></td>
</tr>
<tr style="border:none;">
<td style="width:50%; border:none;" class="detail">Customer Address :{{purchase_data.party.address}}</td>
<td style="width:50%; border:none;"></td>
</tr>
<tr style="border:none;">
<td style="width:50%; border:none;" class="detail">Customer's VAT/PAN No:{{purchase_data.party.it_pan}}</td>
<td style="width:50%; border:none;"></td>
</tr>
</table>
</div>
</div>
<br />
<br />
<div class="Sales_detail">
<table style="width:100%;">
<thead>
<tr>
<th class="w-5">S.N</th>
<th class="w-25">Particulars</th>
<th class="w-5">Qty</th>
<th class="w-5">Unit </th>
<th class="w-5"> Price</th>
<th class="w-5">Amount</th>
</tr>
</thead>
<tbody>
{% for purchaseItem in purchase_items %}
<tr>
<td>{{ forloop.counter }}</td>
<td>{{purchaseItem.item.name}}</td>
<td>{{purchaseItem.quantity}}</td>
<td>{{purchaseItem.unit.name}}</td>
<td>{{purchaseItem.formatted_price}}</td>
{# NOTE(review): both branches below print formatted_amount - confirm whether the line-discount case should show a different field. #}
<td>{% if purchase_data.is_line_discount %}{{purchaseItem.formatted_amount}}{% else %}{{purchaseItem.formatted_amount}}{% endif %}</td>
</tr>
{% endfor %}
<tr>
<td colspan="5" style="text-align:left;">Total Amount</td>
<td>{{purchase_data.total_amount}}</td>
</tr>
<tr>
<td colspan="5" style="text-align:left;">Discount Amount</td>
<td>{{purchase_data.trade_discount_amount}}</td>
</tr>
<tr>
<td colspan="5" style="text-align:left;">Total Taxable Amount</td>
<td>{{purchase_data.taxable_amount}}</td>
</tr>
<tr>
<td colspan="5" style="text-align:left;">13% VAT</td>
<td>{{purchase_data.vat_amount}}</td>
</tr>
<tr>
<td colspan="5" style="text-align:left; font-weight:Bold;">Total NPR Incl. VAT</td>
<td style="font-weight:Bold;">{{purchase_data.total_amount_inc_vat}}</td>
</tr>
</tbody>
</table>
<div class="row">
<div class="col-12 detail">
<p><b>Total Amount in Words :</b> {{purchase_data.formatted_amountwords}}
</p>
</div>
</div>
</div>
<div class="footer" style="margin-top:100px ;">
<table style="width:100%; border:none;">
<tr style="border:none;">
<td style="width:100%; border:none;">
<p class="text-left">(E. & O. E) Goods once sold are not exchangeable or returnable.</p>
</td>
</tr>
<tr style="border:none;">
<td style="width:50%; border:none;"></td>
<td style="width:50%; border:none; text-align:right;">-----------------------------</td>
</tr>
<tr style="border:none;">
<td style="width:50%; border:none;"></td>
<td style="width:50%; border:none; text-align:right;">Authorised Signatory</td>
</tr>
<tr style="border:none;">
<td style="width:50%; border:none;"></td>
<td style="width:50%; border:none; text-align:right;">Printed On : {{purchase_data.formatted_nepalidate}}</td>
</tr>
<tr style="border:none;">
<td style="width:50%; border:none;"></td>
<td style="width:50%; border:none; text-align:right;">Printed By: {{request.user.username}}</td>
</tr>
</table>
</div>
</div>
</body>
</html>
I am getting this output with black square but not the text:
Background: I have a large HTML file that has 8 different pages. Some of the pages in the HTML can be larger than the 11in container size and the 11in stipulated in the @page CSS, due to a lot of data in some of the tables.
What am I trying to do?: I am trying to send in context data (written in Django / Python) to each table of unknown length. Once the data has been entered then I will use weasyprint to create the pdf. At the top of every page the page number and total number of pages should be added dynamically.
The issue: When I print to PDF, the pages with a lot of rows (ones that are >11in) do get the header, but the header shows the same page number on each of the split pages. In the example below, page 2 is split into two pages, and when you print to PDF the headers on pages 3 and 4 are incorrect.
What have I tried?:
Basically everything I could think of. At first I thought about just using paged media but I couldn't figure out how to put this complex of a header on the pages using that method.
Is it possible with just HTML and CSS to do what I want? If it isn't then I may have to figure out a way to add the headers using JS and then saving the HTML before sending it into weasyprint (since weasyprint doesn't support JS). Any suggestions would be appreciated.
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<style>
*{
margin: 0;
padding: 0;
}
/* Page box size for print/PDF output: 8.5in x 11in. */
@page{
size: 8.5in 11in;
}
table.container{
page-break-after: always;
}
td{
padding: 0;
margin: 0;
}
table tbody tr{
vertical-align: top;
page-break-after: always;
}
.container{
height: 11in;
width: 8.5in;
border: 1px solid black;
padding: 10px;
margin: 10px auto ;
}
.container thead{
height: 225px;
vertical-align:top;
}
.top{
display: flex;
align-items:center;
}
.header{
page-break-before: always;
}
.bottom{
font-family: sans-serif;
font-size: 10px;
}
.main{
margin: 0 30px;
}
.thead{
display: flex;
background-color: rgb(123, 199, 157);
color: white;
font-size: 14px;
border-bottom: 1px solid black;
border: 1px solid black;
}
.thead .col-one{
flex:3;
}
.thead .col-two{
flex: 1;
}
.thead .col-three{
flex: 2;
text-align: center;
}
.thead p{
padding: 10px;
}
.tbody{
display: flex;
color: rgb(23, 184, 109);
font-size: 12px;
font-family: sans-serif;
}
.main-data{
padding: 10px 15px ;
}
.sub-data{
padding: 0 30px 0 ;
}
.test i{
padding-top: 13.3px ;
}
.result i{
padding-top: 10px;
}
.result p{
padding: 1px;
}
.tbody tr td:nth-child(2), tr td:nth-child(3){
padding-left: 230px;
}
.tbody td{
padding: 2px 0;
}
.tbody{
border: 1px solid black;
}
.entry p{
color: rgb(78, 208, 143);
}
.entry{
font-family: sans-serif;
font-size: 13px;
padding: 50px 0 0 ;
}
.sub{
padding: 5px;
}
body{
counter-reset: page pages my-counter 0;
}
.header{
display: table-header-group;
}
.footer{
display: table-footer-group;
}
/* Print-only overrides. */
@media print{
.tbody tr td:nth-child(2), tr td:nth-child(3){
padding-left: 220px;
}
.thead{
/* Each declaration needs its own terminating semicolon; without one the
   next declaration is swallowed and both are dropped. */
display: table-header-group;
color: black;
background-color: rgb(69, 213, 129);
}
.thead tr { page-break-inside: avoid; }
}
/* Page-margin box: prints "<current page> of <total pages>" at the bottom right. */
@page {
@bottom-right {
content: counter(page) " of " counter(pages);
}
}
.dot::after {
content: " : "counter(page) " of " counter(pages);
counter-increment: page 1;
}
</style>
</head>
<body>
<table class="container">
<thead>
<tr>
<td>
<div class="haeder">
<div class="top">
<div class="address">
<h2>Company title</h2>
<p>contact info</p>
</div>
<div class="customer">
<p>Report ID: </p>
</div>
</div>
<div class="head-line">
<h1>Document Title</h1>
<p>Page No<span class="dot"></span> </p>
</div>
<div class="details">
<div class="lab-info">
<p>Info: <span>Lab Number</span></p>
<p>Info: <span>Sample ID from Sample Received</span></p>
</div>
<div class="date">
<p>Date Received: </p>
<p>Date Reported: </p>
</div>
</div>
</td>
</tr>
</thead>
<tfoot>
<tr>
<td>
<div class="footer" >
<div class="bottom">
<div class="first-col">
<p>Some Info</p>
<p>Some more info</p>
</div>
</div>
</div>
</td>
</tr>
</tfoot>
<tbody>
<tr>
<td>
<div class="main" >
<div class="thead">
<p class="col-one" >Table Name</p>
<p class="col-two">Result</p>
<p class="col-three">Result 2</p>
</div>
<div class="tbody">
<div class="main">
<table>
<tr>
<td>Test</td>
<td></td>
<td></td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td></td>
<td></td>
</tr>
</table>
</div>
<div class="result">
</div>
<div class="test">
</div>
</div>
</div>
</td>
</tr>
</tbody>
</table>
<table class="container">
<thead>
<tr>
<td>
<div class="haeder">
<div class="top">
<div class="address">
<h2>Company title</h2>
<p>contact info</p>
</div>
<div class="customer">
<p>Report ID: </p>
</div>
</div>
<div class="head-line">
<h1>Document Title</h1>
<p>Page No<span class="dot"></span> </p>
</div>
<div class="details">
<div class="lab-info">
<p>Info: <span>Lab Number</span></p>
<p>Info: <span>Sample ID from Sample Received</span></p>
</div>
<div class="date">
<p>Date Received: </p>
<p>Date Reported: </p>
</div>
</div>
</td>
</tr>
</thead>
<tfoot>
<tr>
<td>
<div class="footer" >
<div class="bottom">
<div class="first-col">
<p>Some Info</p>
<p>Some more info</p>
</div>
</div>
</div>
</td>
</tr>
</tfoot>
<tbody>
<tr>
<td>
<div class="main" >
<div class="thead">
<p class="col-one" >Table Name</p>
<p class="col-two">Result</p>
<p class="col-three">Result 2</p>
</div>
<div class="tbody">
<div class="main">
<table>
<tr>
<td>Test</td>
<td></td>
<td></td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
<td></td>
</tr>
<tr>
<td class="sup">Test</td>
<td></td>
<td></td>
</tr>
</table>
</div>
<div class="result">
</div>
<div class="test">
</div>
</div>
</div>
</td>
</tr>
</tbody>
</table>
</body>
</html>
At this point, my table looks as follows:
<table border="0" cellpadding="0" cellspacing="0" class="ms-formtable" id="formTbl" style="margin-top: 8px;" width="100%">
<tbody>
<tr>
<td class="ms-formlabel" nowrap="true" valign="top" width="165px">
<h3 class="ms-standardheader">
<a name="SPBookmark_FileLeafRef">
</a>
Name
</h3>
</td>
<td class="ms-formbody" id="SPFieldFile" valign="top" width="450px">
<a href="http://google.com" onclick="DispDocItemEx(this, 'FALSE', 'FALSE', 'FALSE', '');">
X
</a>
</td>
</tr>
<tr>
<td class="ms-formlabel" nowrap="true" valign="top" width="165px">
<h3 class="ms-standardheader">
<a name="SPBookmark_Owner">
</a>
Name#
</h3>
</td>
<td class="ms-formbody" id="SPFieldChoice" valign="top" width="450px">
Z
</td>
</tr>
<tr>
<td class="ms-formlabel" nowrap="true" valign="top" width="165px">
<h3 class="ms-standardheader">
<a name="SPBookmark_DirectiveRank">
</a>
Age
</h3>
</td>
<td class="ms-formbody" id="SPFieldChoice" valign="top" width="450px">
52
</td>
</tr>
<tr>
<td class="ms-formlabel" nowrap="true" valign="top" width="165px">
<h3 class="ms-standardheader">
<a name="SPBookmark_Number">
</a>
number
</h3>
</td>
<td class="ms-formbody" id="SPFieldText" valign="top" width="450px">
1
</td>
</tr>
<tr>
<td class="ms-formlabel" nowrap="true" valign="top" width="165px">
<h3 class="ms-standardheader">
<a name="SPBookmark_Title">
</a>
Name of File
</h3>
</td>
<td class="ms-formbody" id="SPFieldText" valign="top" width="450px">
Funny Names
</td>
</tr>
<tr>
<td class="ms-formlabel" nowrap="true" valign="top" width="165px">
<h3 class="ms-standardheader">
<a name="SPBookmark_EffectiveFrom">
</a>
date
</h3>
</td>
<td class="ms-formbody" id="SPFieldDateTime" valign="top" width="450px">
1.1.2022
</td>
</tr>
</tbody>
</table>
I basically need to open an HTML file, find the table with id "formTbl", and then either create JSON with the values as {first td: second td}, e.g. {"Name": "Test", "Date": "Blank"}, or insert them into a database, with the first td going into table A and the second td into table B (each tr contains two td elements: the first is the column name and the second is the value). Is there any way to do this? I've tried using Python, where the JSON I got so far looks like [["","Name","","Test",""],["","Age","","12",""]], and in C# I've tried HtmlAgilityPack, but it wasn't working.
Here is the solution with JQuery.
<html>
<body>
<table id="example-table">
<tr>
<th>Name</th>
<th>Name#</th>
<th>Age</th>
<th>Number</th>
<th>Name of file</th>
<th>Date</th>
</tr>
<tr>
<td>X</td>
<td>Z</td>
<td>52</td>
<td>1</td>
<td>Name of file</td>
<td>2021-22-10</td>
</tr>
</table>
<textarea rows="10" cols="50" id="jsonTextArea">
</textarea>
</body>
</html>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/table-to-json@1.0.0/lib/jquery.tabletojson.min.js"></script>
<script type="text/javascript">
// Convert the rows of #example-table with the table-to-json jQuery plugin.
var tableToJson = $('#example-table').tableToJSON();
// Serialize the result and show it in the textarea.
var sendingData = JSON.stringify (tableToJson);
$('#jsonTextArea').val(sendingData);
// Send JSON data to backend
// {sendingData} is shorthand for {sendingData: sendingData}, so the JSON string
// is posted as a field named "sendingData".
$.post('http://localhost/test.php', {sendingData}, function(data, textStatus, xhr) {
// Log whatever the backend sent back.
var backendResponse = data;
console.log(backendResponse);
});
</script>
I am trying to pick out the text located in tables. The page is dynamic, meaning that sometimes there is one table and sometimes there is more than one. So my question is: how can I pick the text out of these tables?
This is what i tried:
from selenium import webdriver

# webdriver
browser = webdriver.Chrome("C:/Chrome/chromedriver.exe")
browser.get("http://homepage")

# One XPath for all data tables: attributes are addressed with @id, and
# table[position() > 1] matches every nested table after the first one, so the
# number of tables on the page does not matter.
# NOTE(review): the first nested table is skipped because it looks like an
# empty header row in the sample markup — confirm.
pick = browser.find_elements_by_xpath(
    '//*[@id="xpath"]/table[11]/tbody/tr/td/table[position() > 1]/tbody/tr/td[1]/span[2]'
)

# find_elements_* returns a list of elements, so the attribute has to be read
# from each element rather than from the list itself.
texts = [element.get_attribute("innerHTML") for element in pick]
print(texts)
and this is the xpath from each element:
//*[@id="xpath"]/table[11]/tbody/tr/td/table[2]/tbody/tr/td[1]/span[2]
//*[@id="xpath"]/table[11]/tbody/tr/td/table[3]/tbody/tr/td[1]/span[2]
//*[@id="xpath"]/table[11]/tbody/tr/td/table[4]/tbody/tr/td[1]/span[2]
and this is the html code:
<table style="width:700px; " border="0" cellpadding="0" cellspacing="0" width="700px">
<tbody>
<tr>
<td style="border:1px; border-style:solid; ">
<table style="width:700px; " border="0" cellpadding="0" cellspacing="0" width="700px">
<tbody>
<tr>
<td style="border:1px; border-bottom-color:silver; border-bottom-style:solid; width:250px; " valign="top" width="250px"><span> </span><span style="font-size:10pt; font-weight:bold; "> </span></td>
<td style="border:1px; border-bottom-color:silver; border-bottom-style:solid; width:450px; " valign="middle" width="450px"><span style="font-size:10pt; "> </span><br></td>
</tr>
</tbody>
</table>
<table style="width:700px; " border="0" cellpadding="0" cellspacing="0" width="700px">
<tbody>
<tr>
<td style="border:1px; border-bottom-color:silver; border-bottom-style:solid; width:250px; " valign="top" width="250px"><span> </span><span style="font-size:10pt; ">3</span></td>
<td style="border:1px; border-bottom-color:silver; border-bottom-style:solid; width:450px; " valign="middle" width="450px"><span style="font-size:10pt; ">Bleaching preparations and other substances for laundry use; cleaning, polishing, scouring and abrasive preparations; soaps; perfumery, essential oils, cosmetics, hair lotions; dentifrices (all the goods listed alphabetically in the Nice Classification, included in this class).</span><br></td>
</tr>
</tbody>
</table>
<table style="width:700px; " border="0" cellpadding="0" cellspacing="0" width="700px">
<tbody>
<tr>
<td style="border:1px; border-bottom-color:silver; border-bottom-style:solid; width:250px; " valign="top" width="250px"><span> </span><span style="font-size:10pt; ">4</span></td>
<td style="border:1px; border-bottom-color:silver; border-bottom-style:solid; width:450px; " valign="middle" width="450px"><span style="font-size:10pt; ">Industrial oils and greases; lubricants; dust absorbing, wetting and binding compositions; fuels (including motor spirit) and illuminants; candles, wicks (all goods of this class included in the alphabetical list of Nice Classification).</span><br></td>
</tr>
</tbody>
</table>
<table style="width:700px; " border="0" cellpadding="0" cellspacing="0" width="700px">
<tbody>
<tr>
<td style="border:1px; border-bottom-color:silver; border-bottom-style:solid; width:250px; " valign="top" width="250px"><span> </span><span style="font-size:10pt; ">5</span></td>
<td style="border:1px; border-bottom-color:silver; border-bottom-style:solid; width:450px; " valign="middle" width="450px"><span style="font-size:10pt; ">Pharmaceutical and veterinary preparations; sanitary preparations for medical purposes; dietetic foods and substances adapted for medical and veterinary use; food for babies; dietary supplements for humans and animals;plasters, materials for dressings; material for stopping teeth and dental wax; disinfectants; preparations for destroying vermin; fungicides, herbicides;(all goods of this class included in the alphabetical list of Nice Classification).</span><br></td>
</tr>
</tbody>
</table>
</td>
</tr>
</tbody>
</table>
thank you for any help!
I need to resolve 3 problems:
I'm trying to write a simple program in python that parses a web page for the table of table id="dgContract".
Store this table from page 1, page 2, page 3 ... page N in MongoDB; I don't know how to work with MongoDB.
Parse the content behind the selected "Detail" links and store it in MongoDB. To open a selected "view" link, http://www.xxx.com/ needs to be prepended, like http://www.xxx.com/LicConDisp.aspx?CID=xxxxx
open picture in new windows will clear!!!
my code:
import urllib2,cookielib,sys
import urllib,string
import cStringIO,Image,re
import BeautifulSoup # For processing HTML
from BeautifulSoup import BeautifulStoneSoup # For processing XML
from BeautifulSoup import BeautifulSoup
import configparser
from pymongo import Connection
import codecs
import sitecustomize
import chardet
# MongoDB connection settings; not referenced by the parsing code below.
host = 'localhost'
database = 'test'
collection = 'compinfo'

# Read the saved page; the with-block closes the file handle once it is read.
with open('copy of out4.html', 'r') as f:
    html = f.read()

soup = BeautifulSoup(html)
table = soup.find('table', id="dgContract")
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise SystemExit('table with id="dgContract" not found in the input HTML')

# One '|'-separated line per table row, one field per <td>.
store = []
for tr in table.findAll('tr'):
    row = []
    for td in tr.findAll('td'):
        # findAll(text=True) gathers every text node inside the cell, so text
        # nested in <a>/<font>/<strong> or following a <br> is kept.
        # find(text=True) would return only the first text node (or None).
        pieces = td.findAll(text=True)
        # Collapse runs of whitespace/newlines so each row stays on one line;
        # a cell with no text yields an empty field.
        row.append(' '.join(''.join(pieces).split()))
    store.append('|'.join(row))

# Parenthesised single-argument print behaves the same on Python 2 and 3.
print('\n'.join(store))
But the output looks like this (open the picture in a new window to see it clearly):
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" >
<HEAD>
<title>查询</title>
<meta content="Microsoft Visual Studio .NET 7.1" name="GENERATOR">
<meta content="C#" name="CODE_LANGUAGE">
<meta content="JavaScript" name="vs_defaultClientScript">
<meta content="http://schemas.microsoft.com/intellisense/ie5" name="vs_targetSchema">
<meta http-equiv="X-UA-Compatible" content="IE=EmulateIE7" />
<link href="css/user.css" type="text/css" rel="stylesheet">
<style type="text/css">
.STYLE1 {FONT-SIZE: 12px; COLOR: #ffffff}
.STYLE2 {FONT-SIZE: 14px; COLOR: #000000}
.STYLE45 {FONT-SIZE: 12px}
.STYLE51 {FONT-WEIGHT: bold; FONT-SIZE: 12px; FONT-FAMILY: "宋体"}
.STYLE52 {FONT-WEIGHT: bold; FONT-SIZE: 12px; COLOR: #ffffff; FONT-FAMILY: "宋体"}
</style>
</HEAD>
<body background="images/bg.jpg" MS_POSITIONING="GridLayout">
<form name="Form1" method="post" action="ContractSearcher.aspx" id="Form1">
<div align="center">
<table borderColor="#c7c7c7" cellSpacing="0" cellPadding="0" border="1">
<tr>
<td class="tdBorder">
<!-- content -->
<!--显示用户信息条 -->
<!--内容主体:左侧为菜单,右侧为内容显示区 -->
<table height="350" cellSpacing="0" cellPadding="0" width="760" border="0">
<tr>
<!--左侧菜单项 -->
<td width="3"> </td>
<!--右侧内容显示区 -->
<td vAlign="top" width="815" bgColor="#ffffff">
<table width="100%">
<tr>
<td class="tdbigmidcenter">
<table class="tablebigContent" cellspacing="0" rules="all" border="1" id="dgContract" width="815">
<tr bgcolor="PapayaWhip">
<td>numb</td><td>用户1</td><td>用户2</td>
<td>作者</td>
<td align="center">接受时间</td><td align="center">发送</td>
<td align="center">详情</td>
<td align="center">状态</td>
<td>version</td>
</tr>
<tr>
<td width="21%">HOPE-HT-YX-S-140331-120</td><td width="14%">
A公司
</td><td width="14%">A学校</td><td width="5%">david</td><td align="center" width="10%">
2014-3-31
</td><td align="center" width="10%">
未发送
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="" href="ConStatusDisp.aspx?CID=91e13d7a-e812-428d-a5c2-532778ea4e89">已结束[<font color=red>通过</font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%">HOPE-HT-YX-S-140328-106</td>
<td width="14%">
A公司
</td>
<td width="14%">M公司</td><td width="5%">王明</td><td align="center" width="10%">
2014-3-28
</td><td align="center" width="10%">
未发货
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="" href="ConStatusDisp.aspx?CID=72648278-dbe3-4577-9154-23182e349b33">已结束[<font color=red>HOPECE140328-5 </font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%"> </td>
<td width="14%">
B公司
</td>
<td width="14%">C中心</td><td width="5%">王明</td><td align="center" width="10%">
2014-3-12
</td><td align="center" width="10%">
2014-3-28<br>
[<font color=deeppink><strong>全<strong></font>]
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="9900014479" href="ConStatusDisp.aspx?CID=0526a587-85dc-484e-88f4-87967546678f">已结束[<font color=red>HOPETE140313-1 </font>][<font color=deeppink><strong>全<strong></font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%">HOPE-HT-YX-S-140306-001</td>
<td width="14%">
A公司
</td>
<td width="14%">A中心</td>
<td width="5%">JACK</td><td align="center" width="10%">
2014-3-7
</td><td align="center" width="10%">
2014-3-28<br>
[<font color=deeppink><strong>全<strong></font>]
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="9900014479" href="ConStatusDisp.aspx?CID=dfec1630-e1d4-478c-9feb-415eedbd6184">已结束[<font color=red>HOPETE140317-4 </font>][<font color=deeppink><strong>全<strong></font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%">HOPE-HT-YX-S-140228-102</td>
<td width="14%">
G公司
</td>
<td width="14%">F公司</td>
<td width="5%">david</td><td align="center" width="10%">
未通过
</td><td align="center" width="10%">
未发货
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="" href="ConStatusDisp.aspx?CID=9e19e1c9-7644-4392-9bdd-89e2bac346cd">已作废</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%">HOPE-HT-YX-S-140228-005</td>
<td width="14%">
T公司
</td>
<td width="14%">J公司 </td>
<td width="5%">jack</td><td align="center" width="10%">
2014-2-28
</td><td align="center" width="10%">
2014-3-28<br>
[<font color=deeppink><strong>全<strong></font>]
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="9900014480" href="ConStatusDisp.aspx?CID=45039bfb-ccb8-49f4-b8fe-27bc8cf59803">已结束[<font color=red>HOPECE140228-10</font>][<font color=deeppink><strong>全<strong></font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%">HOPE-HT-YX-S-140228-002</td>
<td width="14%">
S公司
</td>
<td width="14%">V公司</td>
<td width="5%">张军</td><td align="center" width="10%">
2014-2-28
</td><td align="center" width="10%">
2014-3-28<br>
[<font color=deeppink><strong>全<strong></font>]
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="9900014479" href="ConStatusDisp.aspx?CID=02a8d406-a826-4a5a-b466-f4bca2640307">已结束[<font color=red>HOPETE140307-4 </font>][<font color=deeppink><strong>全<strong></font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%"> </td>
<td width="14%">
A公司
</td>
<td width="14%">W公司</td><td width="5%">jack</td><td align="center" width="10%">
2014-2-28
</td><td align="center" width="10%">
2014-3-28<br>
[<font color=deeppink><strong>全<strong></font>]
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="9900014479" href="ConStatusDisp.aspx?CID=2684c70a-baea-4da4-911b-19cdbe627fd9">已结束[<font color=red>HOPETE140307-3 </font>][<font color=deeppink><strong>全<strong></font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%">HOPE-HT-YX-S-140228-013</td>
<td width="14%">
B公司
</td><td width="14%">V公司</td>
<td width="5%">rose</td><td align="center" width="10%">
2014-2-28
</td><td align="center" width="10%">
2014-3-28<br>
[<font color=deeppink><strong>全<strong></font>]
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="9900014479" href="ConStatusDisp.aspx?CID=1204ad5e-4552-43af-a650-19b93f9d2514">已结束[<font color=red>HOPETE140307-2 </font>][<font color=deeppink><strong>全<strong></font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%">HOPE-HT-YX-S-140226-018</td>
<td width="14%">
C公司
</td><td width="14%">A中心</td>
<td width="5%">david</td><td align="center" width="10%">
2014-2-26
</td><td align="center" width="10%">
2014-3-14<br>
[<font color=deeppink><strong>全<strong></font>]
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="9900014388" href="ConStatusDisp.aspx?CID=a04cdcd2-5b5c-4182-a22d-a29399ab6991">已结束[<font color=red>HOPECE140228-1 </font>][<font color=deeppink><strong>全<strong></font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr align="right">
<td colspan="9"><span>1</span> 2 3</td>
</tr>