Searching for strings with BeautifulSoup and write to mongodb - python

I need to solve 3 problems:
I'm trying to write a simple program in Python that parses a web page for the table with id="dgContract".
I want to store this table from page 1, page 2, page 3 ... page N in MongoDB, but I don't know how to work with MongoDB.
I also want to parse the content behind each selected "Detail" link and store it in MongoDB. To open a selected "view" link, the site prefix http://www.xxx.com/ must be added, e.g. http://www.xxx.com/LicConDisp.aspx?CID=xxxxx
(Open the picture in a new window to see it clearly.)
My code:
import urllib2,cookielib,sys
import urllib,string
import cStringIO,Image,re
import BeautifulSoup # For processing HTML
from BeautifulSoup import BeautifulStoneSoup # For processing XML
from BeautifulSoup import BeautifulSoup
import configparser
from pymongo import Connection
import codecs
import sitecustomize
import chardet
# Dump the rows of the table with id="dgContract" from a saved HTML page,
# one '|'-separated line per <tr>.

# Connection settings; nothing below uses them yet.
# NOTE(review): presumably intended for the pymongo Connection - confirm.
host = 'localhost'
database = 'test'
collection = 'compinfo'

# Read the saved page; the context manager closes the handle even on error.
with open('copy of out4.html', 'r') as f:
    html = f.read()

soup = BeautifulSoup(html)
table = soup.find('table', id="dgContract")
if table is None:
    # find() yields None when nothing matches; stop with a clear message
    # instead of failing later with an AttributeError on None.
    raise SystemExit('table id="dgContract" not found in the input file')
rows = table.findAll('tr')

store = []
for tr in rows:
    row = []
    for td in tr.findAll('td'):
        # Only the FIRST text node of the cell is kept; a cell without any
        # text node yields None and is stored as an empty string.
        first_text = td.find(text=True)
        row.append(first_text if first_text is not None else '')
    store.append('|'.join(row))

# Single-argument call form prints identically on Python 2 and Python 3.
print('\n'.join(store))
But the output looks like this (open the picture in a new window to see it clearly):
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" >
<HEAD>
<title>查询</title>
<meta content="Microsoft Visual Studio .NET 7.1" name="GENERATOR">
<meta content="C#" name="CODE_LANGUAGE">
<meta content="JavaScript" name="vs_defaultClientScript">
<meta content="http://schemas.microsoft.com/intellisense/ie5" name="vs_targetSchema">
<meta http-equiv="X-UA-Compatible" content="IE=EmulateIE7" />
<link href="css/user.css" type="text/css" rel="stylesheet">
<style type="text/css">
.STYLE1 {FONT-SIZE: 12px; COLOR: #ffffff}
.STYLE2 {FONT-SIZE: 14px; COLOR: #000000}
.STYLE45 {FONT-SIZE: 12px}
.STYLE51 {FONT-WEIGHT: bold; FONT-SIZE: 12px; FONT-FAMILY: "宋体"}
.STYLE52 {FONT-WEIGHT: bold; FONT-SIZE: 12px; COLOR: #ffffff; FONT-FAMILY: "宋体"}
</style>
</HEAD>
<body background="images/bg.jpg" MS_POSITIONING="GridLayout">
<form name="Form1" method="post" action="ContractSearcher.aspx" id="Form1">
<div align="center">
<table borderColor="#c7c7c7" cellSpacing="0" cellPadding="0" border="1">
<tr>
<td class="tdBorder">
<!-- content -->
<!--显示用户信息条 -->
<!--内容主体:左侧为菜单,右侧为内容显示区 -->
<table height="350" cellSpacing="0" cellPadding="0" width="760" border="0">
<tr>
<!--左侧菜单项 -->
<td width="3"> </td>
<!--右侧内容显示区 -->
<td vAlign="top" width="815" bgColor="#ffffff">
<table width="100%">
<tr>
<td class="tdbigmidcenter">
<table class="tablebigContent" cellspacing="0" rules="all" border="1" id="dgContract" width="815">
<tr bgcolor="PapayaWhip">
<td>numb</td><td>用户1</td><td>用户2</td>
<td>作者</td>
<td align="center">接受时间</td><td align="center">发送</td>
<td align="center">详情</td>
<td align="center">状态</td>
<td>version</td>
</tr>
<tr>
<td width="21%">HOPE-HT-YX-S-140331-120</td><td width="14%">
A公司
</td><td width="14%">A学校</td><td width="5%">david</td><td align="center" width="10%">
2014-3-31
</td><td align="center" width="10%">
未发送
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="" href="ConStatusDisp.aspx?CID=91e13d7a-e812-428d-a5c2-532778ea4e89">已结束[<font color=red>通过</font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%">HOPE-HT-YX-S-140328-106</td>
<td width="14%">
A公司
</td>
<td width="14%">M公司</td><td width="5%">王明</td><td align="center" width="10%">
2014-3-28
</td><td align="center" width="10%">
未发货
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="" href="ConStatusDisp.aspx?CID=72648278-dbe3-4577-9154-23182e349b33">已结束[<font color=red>HOPECE140328-5 </font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%"> </td>
<td width="14%">
B公司
</td>
<td width="14%">C中心</td><td width="5%">王明</td><td align="center" width="10%">
2014-3-12
</td><td align="center" width="10%">
2014-3-28<br>
[<font color=deeppink><strong>全<strong></font>]
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="9900014479" href="ConStatusDisp.aspx?CID=0526a587-85dc-484e-88f4-87967546678f">已结束[<font color=red>HOPETE140313-1 </font>][<font color=deeppink><strong>全<strong></font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%">HOPE-HT-YX-S-140306-001</td>
<td width="14%">
A公司
</td>
<td width="14%">A中心</td>
<td width="5%">JACK</td><td align="center" width="10%">
2014-3-7
</td><td align="center" width="10%">
2014-3-28<br>
[<font color=deeppink><strong>全<strong></font>]
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="9900014479" href="ConStatusDisp.aspx?CID=dfec1630-e1d4-478c-9feb-415eedbd6184">已结束[<font color=red>HOPETE140317-4 </font>][<font color=deeppink><strong>全<strong></font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%">HOPE-HT-YX-S-140228-102</td>
<td width="14%">
G公司
</td>
<td width="14%">F公司</td>
<td width="5%">david</td><td align="center" width="10%">
未通过
</td><td align="center" width="10%">
未发货
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="" href="ConStatusDisp.aspx?CID=9e19e1c9-7644-4392-9bdd-89e2bac346cd">已作废</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%">HOPE-HT-YX-S-140228-005</td>
<td width="14%">
T公司
</td>
<td width="14%">J公司 </td>
<td width="5%">jack</td><td align="center" width="10%">
2014-2-28
</td><td align="center" width="10%">
2014-3-28<br>
[<font color=deeppink><strong>全<strong></font>]
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="9900014480" href="ConStatusDisp.aspx?CID=45039bfb-ccb8-49f4-b8fe-27bc8cf59803">已结束[<font color=red>HOPECE140228-10</font>][<font color=deeppink><strong>全<strong></font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%">HOPE-HT-YX-S-140228-002</td>
<td width="14%">
S公司
</td>
<td width="14%">V公司</td>
<td width="5%">张军</td><td align="center" width="10%">
2014-2-28
</td><td align="center" width="10%">
2014-3-28<br>
[<font color=deeppink><strong>全<strong></font>]
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="9900014479" href="ConStatusDisp.aspx?CID=02a8d406-a826-4a5a-b466-f4bca2640307">已结束[<font color=red>HOPETE140307-4 </font>][<font color=deeppink><strong>全<strong></font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%"> </td>
<td width="14%">
A公司
</td>
<td width="14%">W公司</td><td width="5%">jack</td><td align="center" width="10%">
2014-2-28
</td><td align="center" width="10%">
2014-3-28<br>
[<font color=deeppink><strong>全<strong></font>]
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="9900014479" href="ConStatusDisp.aspx?CID=2684c70a-baea-4da4-911b-19cdbe627fd9">已结束[<font color=red>HOPETE140307-3 </font>][<font color=deeppink><strong>全<strong></font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%">HOPE-HT-YX-S-140228-013</td>
<td width="14%">
B公司
</td><td width="14%">V公司</td>
<td width="5%">rose</td><td align="center" width="10%">
2014-2-28
</td><td align="center" width="10%">
2014-3-28<br>
[<font color=deeppink><strong>全<strong></font>]
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="9900014479" href="ConStatusDisp.aspx?CID=1204ad5e-4552-43af-a650-19b93f9d2514">已结束[<font color=red>HOPETE140307-2 </font>][<font color=deeppink><strong>全<strong></font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr>
<td width="21%">HOPE-HT-YX-S-140226-018</td>
<td width="14%">
C公司
</td><td width="14%">A中心</td>
<td width="5%">david</td><td align="center" width="10%">
2014-2-26
</td><td align="center" width="10%">
2014-3-14<br>
[<font color=deeppink><strong>全<strong></font>]
</td><td align="center" width="7%">
查看
</td><td align="center" width="10%">
<a title="9900014388" href="ConStatusDisp.aspx?CID=a04cdcd2-5b5c-4182-a22d-a29399ab6991">已结束[<font color=red>HOPECE140228-1 </font>][<font color=deeppink><strong>全<strong></font>]</a>
</td><td align="center" width="5%">
1.0
</td>
</tr><tr align="right">
<td colspan="9"><span>1</span> 2 3</td>
</tr>

Related

Page Number and Total Pages in Header When Printing HTML to PDF

Background: I have a large HTML file that has 8 different pages. Some of the pages in the HTML can be larger than the 11in container size and the 11in stipulated in the @page CSS rule, due to a lot of data in some of the tables.
What am I trying to do?: I am trying to send in context data (written in Django / Python) to each table of unknown length. Once the data has been entered then I will use weasyprint to create the pdf. At the top of every page the page number and total number of pages should be added dynamically.
The issue: When I print to PDF, the pages with a lot of rows (the ones that are >11in) do get the header, but the header shows the same page number for all of the split pages. In the example below, it is page 2 that is split into two pages, and when you print to PDF the headers on pages 3 and 4 are incorrect.
What have I tried?:
Basically everything I could think of. At first I thought about just using paged media but I couldn't figure out how to put this complex of a header on the pages using that method.
Is it possible with just HTML and CSS to do what I want? If it isn't then I may have to figure out a way to add the headers using JS and then saving the HTML before sending it into weasyprint (since weasyprint doesn't support JS). Any suggestions would be appreciated.
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<style>
*{
margin: 0;
padding: 0;
}
/* Page box for paged media (8.5in x 11in). "@page" is an at-rule; written as
   "#page" it would be an id selector and "size" would be ignored. */
@page{
    size: 8.5in 11in;
}
table.container{
page-break-after: always;
}
td{
padding: 0;
margin: 0;
}
table tbody tr{
vertical-align: top;
page-break-after: always;
}
.container{
height: 11in;
width: 8.5in;
border: 1px solid black;
padding: 10px;
margin: 10px auto ;
}
.container thead{
height: 225px;
vertical-align:top;
}
.top{
display: flex;
align-items:center;
}
.header{
page-break-before: always;
}
.bottom{
font-family: sans-serif;
font-size: 10px;
}
.main{
margin: 0 30px;
}
.thead{
display: flex;
background-color: rgb(123, 199, 157);
color: white;
font-size: 14px;
border-bottom: 1px solid black;
border: 1px solid black;
}
.thead .col-one{
flex:3;
}
.thead .col-two{
flex: 1;
}
.thead .col-three{
flex: 2;
text-align: center;
}
.thead p{
padding: 10px;
}
.tbody{
display: flex;
color: rgb(23, 184, 109);
font-size: 12px;
font-family: sans-serif;
}
.main-data{
padding: 10px 15px ;
}
.sub-data{
padding: 0 30px 0 ;
}
.test i{
padding-top: 13.3px ;
}
.result i{
padding-top: 10px;
}
.result p{
padding: 1px;
}
.tbody tr td:nth-child(2), tr td:nth-child(3){
padding-left: 230px;
}
.tbody td{
padding: 2px 0;
}
.tbody{
border: 1px solid black;
}
.entry p{
color: rgb(78, 208, 143);
}
.entry{
font-family: sans-serif;
font-size: 13px;
padding: 50px 0 0 ;
}
.sub{
padding: 5px;
}
body{
counter-reset: page pages my-counter 0;
}
.header{
display: table-header-group;
}
.footer{
display: table-footer-group;
}
/* Print-only overrides. */
@media print{
    .tbody tr td:nth-child(2), tr td:nth-child(3){
        padding-left: 220px;
    }
    .thead{
        /* the ';' matters: without it this declaration and the "color"
           below are parsed as one invalid declaration and both are dropped */
        display: table-header-group;
        color: black;
        background-color: rgb(69, 213, 129);
    }
    .thead tr { page-break-inside: avoid; }
}
/* Margin box in the bottom-right corner of every page: "N of TOTAL". */
@page {
    @bottom-right {
        content: counter(page) " of " counter(pages);
    }
}
.dot::after {
content: " : "counter(page) " of " counter(pages);
counter-increment: page 1;
}
</style>
</head>
<body>
<table class="container">
<thead>
<tr>
<td>
<div class="haeder">
<div class="top">
<div class="address">
<h2>Company title</h2>
<p>contact info</p>
</div>
<div class="customer">
<p>Report ID: </p>
</div>
</div>
<div class="head-line">
<h1>Document Title</h1>
<p>Page No<span class="dot"></span> </p>
</div>
<div class="details">
<div class="lab-info">
<p>Info: <span>Lab Number</span></p>
<p>Info: <span>Sample ID from Sample Received</span></p>
</div>
<div class="date">
<p>Date Received: </p>
<p>Date Reported: </p>
</div>
</div>
</td>
</tr>
</thead>
<tfoot>
<tr>
<td>
<div class="footer" >
<div class="bottom">
<div class="first-col">
<p>Some Info</p>
<p>Some more info</p>
</div>
</div>
</div>
</td>
</tr>
</tfoot>
<tbody>
<tr>
<td>
<div class="main" >
<div class="thead">
<p class="col-one" >Table Name</p>
<p class="col-two">Result</p>
<p class="col-three">Result 2</p>
</div>
<div class="tbody">
<div class="main">
<table>
<tr>
<td>Test</td>
<td></td>
<td></td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td></td>
<td></td>
</tr>
</table>
</div>
<div class="result">
</div>
<div class="test">
</div>
</div>
</div>
</td>
</tr>
</tbody>
</table>
<table class="container">
<thead>
<tr>
<td>
<div class="haeder">
<div class="top">
<div class="address">
<h2>Company title</h2>
<p>contact info</p>
</div>
<div class="customer">
<p>Report ID: </p>
</div>
</div>
<div class="head-line">
<h1>Document Title</h1>
<!-- .dot is an empty hook element: the stylesheet's .dot::after rule renders the page counter text -->
<p>Page No<span class="dot"></span> </p>
</div>
<div class="details">
<div class="lab-info">
<p>Info: <span>Lab Number</span></p>
<p>Info: <span>Sample ID from Sample Received</span></p>
</div>
<div class="date">
<p>Date Received: </p>
<p>Date Reported: </p>
</div>
</div>
</td>
</tr>
</thead>
<tfoot>
<tr>
<td>
<div class="footer" >
<div class="bottom">
<div class="first-col">
<p>Some Info</p>
<p>Some more info</p>
</div>
</div>
</div>
</td>
</tr>
</tfoot>
<tbody>
<tr>
<td>
<div class="main" >
<div class="thead">
<p class="col-one" >Table Name</p>
<p class="col-two">Result</p>
<p class="col-three">Result 2</p>
</div>
<div class="tbody">
<div class="main">
<table>
<tr>
<td>Test</td>
<td></td>
<td></td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td> %</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
</tr>
<tr>
<td class="sup">Test</td>
<td>%</td>
<td></td>
</tr>
<tr>
<td class="sup">Test</td>
<td></td>
<td></td>
</tr>
</table>
</div>
<div class="result">
</div>
<div class="test">
</div>
</div>
</div>
</td>
</tr>
</tbody>
</table>
</body>
</html>

How to extract the following lines after pattern match

the web source is like this:
<div class="MT12">
<table class="tblchart" border="0" cellspacing="0" cellpadding="0">
<tr>
<th rowspan="2" width="100" align="left" valign="top">Date</th>
<th rowspan="2" width="100" style="text-align:right;" valign="top">Open</th>
<th rowspan="2" width="100" style="text-align:right;" valign="top">High</th>
<th rowspan="2" width="100" style="text-align:right;" valign="top">Low</th>
<th rowspan="2" width="100" style="text-align:right;" valign="top">Close</th>
<th colspan="2" style="text-align:center;" valign="top">- SPREAD -</th>
</tr>
<tr>
<th width="100" style="text-align:right;" valign="top">(High-Low)</th>
<th width="100" style="text-align:right;" valign="top" class="last">(Open-Close)</th>
</tr>
<tr>
<td align="left" valign="top">2019-12-24</td>
<td valign="top" style="text-align:right;">12269.25</td>
<td valign="top" class="b_12vv" style="text-align:right">12283.70</td>
<td valign="top" style="text-align:right;">12202.10</td>
<td valign="top" style="text-align:right;">12214.55</td>
<td valign="top" style="text-align:right;">81.60</td>
<td align="right" valign="top" class="last" style="text-align:right;">54.70</td>
</tr>
<tr>
<td align="left" valign="top">2019-12-23</td>
<td valign="top" style="text-align:right;">12235.45</td>
<td valign="top" class="b_12vv" style="text-align:right">12287.15</td>
<td valign="top" style="text-align:right;">12213.25</td>
<td valign="top" style="text-align:right;">12262.75</td>
<td valign="top" style="text-align:right;">73.90</td>
<td align="right" valign="top" class="last" style="text-align:right;">-27.30</td>
</tr>
<tr>
<td align="left" valign="top">2019-12-20</td>
<td valign="top" style="text-align:right;">12266.45</td>
<td valign="top" class="b_12vv" style="text-align:right">12293.90</td>
<td valign="top" style="text-align:right;">12252.75</td>
<td valign="top" style="text-align:right;">12271.80</td>
<td valign="top" style="text-align:right;">41.15</td>
<td align="right" valign="top" class="last" style="text-align:right;">-5.35</td>
</tr>
</table>
</div>
I want to get the following numbers for every date:
say for example I have to get the numbers 12269.25, 12283.70, 12202.10 and 12214.55 for a particular date (2019-12-24). Then proceed for the next date given.
I am facing difficulty because I need to select the next 4 lines (whose XPaths are not closely related to each other, as shown above) following each date in the page. The number of dates can range from a single date to 100-200 dates.
Can anybody please help with a webdriver code snippet for this?
Thanks a lot
Can this meet your needs?
from simplified_scrapy.simplified_doc import SimplifiedDoc
html = '''<div class="MT12">
<table class="tblchart" border="0" cellspacing="0" cellpadding="0">
<tr>
<th rowspan="2" width="100" align="left" valign="top">Date</th>
<th rowspan="2" width="100" style="text-align:right;" valign="top">Open</th>
<th rowspan="2" width="100" style="text-align:right;" valign="top">High</th>
<th rowspan="2" width="100" style="text-align:right;" valign="top">Low</th>
<th rowspan="2" width="100" style="text-align:right;" valign="top">Close</th>
<th colspan="2" style="text-align:center;" valign="top">- SPREAD -</th>
</tr>
<tr>
<th width="100" style="text-align:right;" valign="top">(High-Low)</th>
<th width="100" style="text-align:right;" valign="top" class="last">(Open-Close)</th>
</tr>
<tr>
<td align="left" valign="top">2019-12-24</td>
<td valign="top" style="text-align:right;">12269.25</td>
<td valign="top" class="b_12vv" style="text-align:right">12283.70</td>
<td valign="top" style="text-align:right;">12202.10</td>
<td valign="top" style="text-align:right;">12214.55</td>
<td valign="top" style="text-align:right;">81.60</td>
<td align="right" valign="top" class="last" style="text-align:right;">54.70</td>
</tr>
<tr>
<td align="left" valign="top">2019-12-23</td>
<td valign="top" style="text-align:right;">12235.45</td>
<td valign="top" class="b_12vv" style="text-align:right">12287.15</td>
<td valign="top" style="text-align:right;">12213.25</td>
<td valign="top" style="text-align:right;">12262.75</td>
<td valign="top" style="text-align:right;">73.90</td>
<td align="right" valign="top" class="last" style="text-align:right;">-27.30</td>
</tr>
<tr>
<td align="left" valign="top">2019-12-20</td>
<td valign="top" style="text-align:right;">12266.45</td>
<td valign="top" class="b_12vv" style="text-align:right">12293.90</td>
<td valign="top" style="text-align:right;">12252.75</td>
<td valign="top" style="text-align:right;">12271.80</td>
<td valign="top" style="text-align:right;">41.15</td>
<td align="right" valign="top" class="last" style="text-align:right;">-5.35</td>
</tr>
</table>
</div>'''
# Parse the sample markup held in `html` and print, for every data row,
# the date followed by the Open, High, Low and Close values.
doc = SimplifiedDoc(html)
table = doc.getElement(tag='table',value='tblchart')
# NOTE(review): presumably drops the two header rows, which contain <th> cells - confirm
trs = table.trs.notContains('<th') # get tr
for tr in trs:
    tds = tr.tds # get all td
    data = [td.text for td in tds]
    # Column order in the sample table: Date, Open, High, Low, Close,
    # then the two spread columns, which are not printed.
    print (data[0],data[1],data[2],data[3],data[4])

Extract specific value using BeautifulSoup

I was scraping this website (https://www.ivolatility.com/options/RVX/) using the Python module requests. The output from selecting the first table with BeautifulSoup is shown below. Now, inside this first table, I am trying to get a specific value (19.17) from the soup built from the requests response.
I would like to achieve this using BeautifulSoup, but I don't know how to specifically select the cell where the value is stored.
Do any of you have any suggestions?
Output from requests:
<table border="0" bordercolor="red" cellpadding="0" cellspacing="0" width="100%">
<tr>
<td colspan="3"><script language="JavaScript">
function submitCalcForm(event) {
event.preventDefault();
var form = document.getElementById('basicOptionsForm');
var action = form.action;
var regions = ['', 'USA', 'Europe', 'Asia', 'Canada'];
var regionsOptions = form[1];
var selectedRegion = regionsOptions.options[regionsOptions.selectedIndex].value;
var symbol = form[0].value.trim();
var location = (window.location.href.indexOf('.j')>-1)
? (form.action + '?' + form[0].name + '=' + form[0].value + '&' + form[1].name + '=' + selectedRegion)
: ('/options/'+ ((symbol == '') ? '-' : symbol ) +'/'+regions[selectedRegion]);
window.location.href= location;
}
function goToLookup() {
window.location.href= "/options/-/";
}
</script>
<form action="/options.j" id="basicOptionsForm" method="get" onsubmit="submitCalcForm(event);">
<table bgcolor="#ffffff" border="0" cellpadding="0" cellspacing="0">
<tr>
<td>
<table bgcolor="#999999" border="0" cellpadding="0" cellspacing="1">
<tr>
<td bgcolor="#567abb">
<table border="0" cellpadding="1" cellspacing="0" class="table-action">
<tr>
<td><span class="s1w" style="color: #fff;"> Symbol: </span></td><td><input class="s2" name="ticker" size="5" type="text" value="RVX"/></td><td><select class="s2" name="R"><option selected="" value="0">
ALL
</option><option value="1">
USA
</option><option value="2">
Europe
</option><option value="4">
Canada
</option></select></td><td><span class="s2"> </span></td><td><button style="background: #0C6EF8; font-weight: bold; border: 1px solid black;" type="submit">GO!</button></td><td><span class="s2"> </span></td><td><button onclick="goToLookup();" style="background: #0C6EF8; font-weight: bold; border: 1px solid black; color: white; white-space: nowrap;" type="button">
Symbol Lookup</button></td><td><span class="s2"> </span></td>
</tr>
</table>
</td>
</tr>
</table>
</td><td><img border="0" height="1" src="/design/images/0.gif" width="5"/></td><td nowrap=""><b><span class="s4">Russell 2000 Volatility Index</span></b></td><td width="100%"> </td>
</tr>
</table>
</form>
</td>
</tr>
<tr>
<td colspan="3"><img alt="." border="0" height="10" src="/design/images/0.gif" width="1"/></td>
</tr>
<tr valign="top">
<td width="100%"><script type="text/javascript">
<!--
function wr(s) {
document.write(s);
}
var d = new Array(10);
d[20]='N/A';d[25]='-94.06%';d[30]='32.03%';d[35]='34.74';d[56]='N/A';d[61]='N/A';d[66]='10-Apr';d[71]='84.49%';d[97]='N/A';d[102]='03-Oct';d[107]='29-Mar';d[112]='1.43';d[133]='N/A';d[138]='N/A';d[143]='148.97%';d[148]='98.46%';d[174]='N/A';d[179]='-46.88%';d[184]='198.21%';d[189]='0.27';d[210]='N/A';d[215]='N/A';d[220]='25-May';d[225]='110.30%';d[251]='N/A';d[256]='-68.76%';d[261]='75.38%';d[266]='0';d[287]='N/A';d[292]='N/A';d[297]='39.85%';d[302]='120.02%';d[328]='N/A';d[333]='-67.09%';d[338]='69.94%';d[343]='19.17';d[364]='N/A';d[369]='N/A';d[374]='06-Apr';d[379]='06/14/2018';d[405]='N/A';d[410]='-82.49%';d[415]='74.41%';d[441]='N/A';d[446]='N/A';d[451]='164.16%';d[456]='12.93';d[482]='N/A';d[487]='24-May';d[492]='77.70%';d[518]='N/A';d[523]='03-May';d[528]='21-May';d[533]='12/24/2018';d[559]='N/A';d[564]='59.42%';d[569]='84.78%';
wr('<table class="table-data" cellpadding=1 cellspacing=1 border=0 width=100%>');
wr('<tr bgcolor="#cccccc" align=right height=20>');
wr('<td align="center"><font class=s1>Price</font></td>');
wr('<td align="center"><font class=s1>Change (%)</font><img src="/design/images/0.gif" width=4 height=1 border=0/></td>');
wr('<td align="center"><font class=s1>52 wk High</font><img src="/design/images/0.gif" width=4 height=1 border=0/></td>');
wr('<td align="center"><font class=s1>52 wk Low</font><img src="/design/images/0.gif" width=4 height=1 border=0/></td>');
wr('<td align="center"><font class=s1>Stock volume</font>');
wr('<a href="javascript:openHelp(14)" alt="Open Help">');
wr('<img src="/design/images/ico/q_zn.gif" width=8 height=10 border=0 alt="Open Help"/>');
wr('</a><img src="/design/images/0.gif" width=4 height=1 border=0/></td>');
wr('</tr>');
wr('<tr bgcolor="#FFFFFF" align=right height=20>');
wr('<td align="center"><font class=s1>');
wr(d[343]);
wr('</font></td>');
wr('<td align="center"><font class=s1><nobr> ');
wr('<img src="/design/images/ico/up.gif" alt="+" border=0 align="absmiddle" width=7 height=9/> +');
wr(d[189]);
wr(' (+');
wr(d[112]);
wr('%)</nobr></font></td>');
wr('<td align="center"><font class=s1><nobr> ');
wr(d[35]);
wr(' ');
wr(d[533]);
wr('</nobr></font></td><td align="center"><font class=s1><nobr> ');
wr(d[456]);
wr(' ');
wr(d[379]);
wr('</nobr></font></td>');
wr('<td align="center"><font size=-2 class=s1>');
wr(d[266]);
wr('</font></td>');
wr('</tr></table>');
//-->
</script><img border="0" height="10" src="/design/images/0.gif" width="1"/><table border="0" cellpadding="0" cellspacing="0" class="table-data" width="100%">
<tr align="center" bgcolor="
#cccccc
" height="20">
<td align="center" colspan="2"><font class="s2">Current</font></td><td><font class="s2">1 WK AGO</font></td><td><font class="s2">1 MO AGO</font></td><td><font class="s2">52 wk Hi/Date</font></td><td><font class="s2">52 wk Low/Date</font></td>
</tr>
<tr>
<td align="center" bgcolor="
#FFFFFF
" colspan="5" height="20"><font class="s2" color="">  HISTORICAL VOLATILITY <a alt="Open Help" href="javascript:openHelp(4)"><img alt="Open Help" border="0" height="10" src="/design/images/ico/q_zn.gif" width="8"/></a></font></td>
</tr>
<tr align="center" bgcolor="#ffffff">
<td align="right"><font class="s2">10 days</font></td><td><font class="s2">120.02%</font></td><td><font class="s2">84.49%</font></td><td><font class="s2">74.41%</font></td><td><font class="s2">198.21% - 29-Mar</font></td><td><font class="s2">32.03% - 21-May</font></td>
</tr>
<tr align="center" bgcolor="#eeeeee">
<td align="right"><font class="s2">20 days</font></td><td><font class="s2">110.30%</font></td><td><font class="s2">84.78%</font></td><td><font class="s2">69.94%</font></td><td><font class="s2">164.16% - 06-Apr</font></td><td><font class="s2">39.85% - 25-May</font></td>
</tr>
<tr align="center" bgcolor="#ffffff">
<td align="right"><font class="s2">30 days</font></td><td><font class="s2">98.46%</font></td><td><font class="s2">77.70%</font></td><td><font class="s2">75.38%</font></td><td><font class="s2">148.97% - 10-Apr</font></td><td><font class="s2">59.42% - 24-May</font></td>
</tr>
<tr>
<td align="center" bgcolor="
#FFFFFF
" colspan="5" height="20"><font class="s2" color="">  IMPLIED VOLATILITY <img alt="Open Help" border="0" height="10" src="/design/images/ico/q_zn.gif" width="8"/></font></td>
</tr>
<tr align="center" bgcolor="#ffffff">
<td align="right"><font class="s2">IV Index call <img alt="Open Help" border="0" height="10" src="/design/images/ico/q_zn.gif" width="8"/></font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A - N/A</font></td><td><font class="s2">N/A - N/A</font></td>
</tr>
<tr align="center" bgcolor="#eeeeee">
<td align="right"><font class="s2">IV Index put <img alt="Open Help" border="0" height="10" src="/design/images/ico/q_zn.gif" width="8"/></font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A - N/A</font></td><td><font class="s2">N/A - N/A</font></td>
</tr>
<tr align="center" bgcolor="#ffffff">
<td align="right"><font class="s2">IV Index mean <img alt="Open Help" border="0" height="10" src="/design/images/ico/q_zn.gif" width="8"/></font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A</font></td><td><font class="s2">N/A - N/A</font></td><td><font class="s2">N/A - N/A</font></td>
</tr>
<tr>
<td align="center" bgcolor="
#FFFFFF
" colspan="5" height="20"><font class="s2" color="">HISTORICAL 30-DAYS CORRELATION AGAINST S&P 500 Index (SPX)<img alt="Open Help" border="0" height="10" src="/design/images/ico/q_zn.gif" width="8"/></font></td>
</tr>
<tr align="center" bgcolor="#ffffff">
<td align="right"><font class="s2">30 days</font></td><td><font class="s2">-82.49%</font></td><td><font class="s2">-67.09%</font></td><td><font class="s2">-68.76%</font></td><td><font class="s2">-46.88% - 03-Oct</font></td><td><font class="s2">-94.06% - 03-May</font></td>
</tr>
</table>
</td>
</tr>
</table>
The page is dynamic, so you'd need to render the page first with something like Selenium.
Also, you can use BeautifulSoup, or even Selenium, to parse the HTML once you have it. But I noticed that the value is located within <table> tags. Whenever I see a <table> tag, I usually opt to go with pandas' .read_html() as it'll do the hard work for you.
.read_html() will return a list of dataframes; then it's just a matter of finding the data you want, or manipulating the table as needed. The data you want was found in the dataframe at index position 4 (it was also in position 0, but I chose to go with 4 since it was right there: 2nd row, first column). Then just slice that dataframe to get that specific cell:
from selenium import webdriver
import pandas as pd
# Start a Chrome session; the argument is the path to the local chromedriver binary.
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
url = 'https://www.ivolatility.com/options/RVX/'
try:
    # The page is dynamic, so let the browser load it; page_source then holds
    # the rendered HTML.
    driver.get(url)
    # read_html returns a list of DataFrames, one per <table> it finds.
    tables = pd.read_html(driver.page_source)
finally:
    # quit() ends the whole browser session even when loading or parsing fails;
    # close() would only close the current window.
    driver.quit()
# The wanted value is in table 4, column label 0, row label 1
# (2nd row, first column).
price = tables[4][0][1]
Output:
print (price)
19.17

Beautifulsoup results to pandas dataframe

The code below returns a table with the following results:
# Download the page and parse its text with the lxml backend.
response = requests.get(url)
soup = bs4.BeautifulSoup(response.text, 'lxml')
# Take the first element whose class is 'table_grey_border'.
mylist = soup.find(class_='table_grey_border')
print(mylist)
results - it stretches on for 1700 rows
<table cellpadding="0" cellspacing="2" class="table_grey_border" width="100%">
<tr valign="top">
<td class="verd_black12" width="18%"><b>STOCK CODE</b></td>
<td class="verd_black12" width="42%"><b>NAME OF LISTED SECURITIES</b></td>
<td class="verd_black12" width="19%"><b>BOARD LOT</b></td>
<td class="verd_black12" colspan="4" width="12%"><b>REMARK</b></td>
</tr>
<tr class="tr_normal">
<td class="verd_black12" width="18%">00001</td>
<td class="verd_black12" width="42%">CKH HOLDINGS</td>
<td class="verd_black12" width="19%">500</td>
<td align="center" class="verd_black12" width="3%">#</td>
<td align="center" class="verd_black12" width="3%">H</td>
<td align="center" class="verd_black12" width="3%">O</td>
<td align="center" class="verd_black12" width="3%">F</td>
</tr>
<tr class="tr_normal">
<td class="verd_black12" width="18%">00002</td>
<td class="verd_black12" width="42%">CLP HOLDINGS</td>
<td class="verd_black12" width="19%">500</td>
<td align="center" class="verd_black12" width="3%">#</td>
<td align="center" class="verd_black12" width="3%">H</td>
<td align="center" class="verd_black12" width="3%">O</td>
<td align="center" class="verd_black12" width="3%">F</td>
</tr>
...
My question is: how do I put each of these rows into a pandas DataFrame? I tried the code below, but it returns an error:
# NOTE(review): mylist is the bs4 element returned by soup.find(...), not a URL
# or an HTML string; presumably that is why this call raises the TypeError
# shown below — confirm against the pandas.read_html documentation.
a = pandas.read_html(mylist)
print(a)
error
TypeError: 'NoneType' object is not callable
Document:
# Have pandas read the page at `url` directly, keeping only tables whose class
# attribute is 'table_grey_border'; the result is a list of DataFrames.
pandas.read_html(url, attrs={'class': 'table_grey_border'})

How to extract using beautifulsoup python [duplicate]

This question already has answers here:
python beautifulsoup extracting text
(2 answers)
Closed 9 years ago.
I am only interested in using BeautifulSoup to extract all the values of the 3-hr PSI readings from 12AM to 11.59PM, such as the latest value, 82 at 5PM, which is shown in bold.
An example of the website is at http://app2.nea.gov.sg/anti-pollution-radiation-protection/air-pollution/psi/psi-readings-over-the-last-24-hours. Can anyone teach me how? Thanks in advance!
<!-- start content -->
<h1 class="title" id="top">
PSI Readings over the last 24 Hours</h1>
<script type="text/javascript">
var baseUrl = '/anti-pollution-radiation-protection/air-pollution/psi/psi-readings-over-the-last-24-hours';
// Navigate to the readings page for the time chosen in the drop-down `ddl`.
function changetime(ddl) {
// Value of the currently selected <option>.
var strTime = ddl.options[ddl.selectedIndex].value;
if (strTime != null) {
// Target is baseUrl + "/time/" + the selected value, anchored at #psi24.
var npage = baseUrl + "/time/" + strTime + "#psi24";
window.location = npage;
}
}
</script>
<h1 id="psi24">
24-hr PSI Readings on 24 Jun 2013
</h1>
<p>
View reading for:
<select class="default" id="ContentPlaceHolderContent_C001_DDLTime" name="ctl00$ContentPlaceHolderContent$C001$DDLTime" onchange="changetime(this);">
<option value="0000">12AM</option>
<option value="0100">1AM</option>
<option value="0200">2AM</option>
<option value="0300">3AM</option>
<option value="0400">4AM</option>
<option value="0500">5AM</option>
<option value="0600">6AM</option>
<option value="0700">7AM</option>
<option value="0800">8AM</option>
<option value="0900">9AM</option>
<option value="1000">10AM</option>
<option value="1100">11AM</option>
<option value="1200">12PM</option>
<option value="1300">1PM</option>
<option value="1400">2PM</option>
<option value="1500">3PM</option>
<option value="1600">4PM</option>
<option selected="selected" value="1700">5PM</option>
</select>
</p>
<table border="0" cellpadding="4" cellspacing="1" class="text_psinormal" width="100%">
<thead>
<tr>
<th width="33%">
<center><strong>Region</strong></center>
</th>
<th width="33%">
<center><strong>PSI</strong></center>
</th>
<th width="34%">
<center><strong>24-hr PM2.5 Concentration (µg/m<sup>3</sup>)</strong></center>
</th>
</tr>
</thead>
<tr>
<td align="center">North
</td>
<td align="center">
61
</td>
<td align="center">
47
</td>
</tr>
<tr>
<td align="center">South
</td>
<td align="center">
62
</td>
<td align="center">
46
</td>
</tr>
<tr>
<td align="center">East
</td>
<td align="center">
55
</td>
<td align="center">
39
</td>
</tr>
<tr>
<td align="center">West
</td>
<td align="center">
87
</td>
<td align="center">
83
</td>
</tr>
<tr>
<td align="center">Central
</td>
<td align="center">
58
</td>
<td align="center">
40
</td>
</tr>
<tr>
<td align="center">Overall Singapore
</td>
<td align="center">
55-87
</td>
<td align="center">
39-83
</td>
</tr>
</table>
<div>
</div>
<div>
<h1>3-hr PSI Readings from 12AM to 11.59PM on
24 Jun 2013</h1>
<table border="0" cellpadding="4" cellspacing="1" width="100%">
<tr>
<td align="center" width="16%">
<strong>Time</strong>
</td>
<td align="center" width="7%"><strong>12AM</strong>
</td>
<td align="center" width="7%"><strong>1AM</strong>
</td>
<td align="center" width="7%"><strong>2AM</strong>
</td>
<td align="center" width="7%"><strong>3AM</strong>
</td>
<td align="center" width="7%"><strong>4AM</strong>
</td>
<td align="center" width="7%"><strong>5AM</strong>
</td>
<td align="center" width="7%"><strong>6AM</strong>
</td>
<td align="center" width="7%"><strong>7AM</strong>
</td>
<td align="center" width="7%"><strong>8AM</strong>
</td>
<td align="center" width="7%"><strong>9AM</strong>
</td>
<td align="center" width="7%"><strong>10AM</strong>
</td>
<td align="center" width="7%"><strong>11AM</strong>
</td>
</tr>
<tr>
<td align="center">
<strong>3-hr PSI</strong>
</td>
<td align="center">
76
</td>
<td align="center">
70
</td>
<td align="center">
64
</td>
<td align="center">
59
</td>
<td align="center">
54
</td>
<td align="center">
51
</td>
<td align="center">
48
</td>
<td align="center">
47
</td>
<td align="center">
47
</td>
<td align="center">
47
</td>
<td align="center">
49
</td>
<td align="center">
52
</td>
</tr>
<tr>
<td align="center" width="16%">
<strong>Time</strong>
</td>
<td align="center" width="7%"><strong>12PM</strong>
</td>
<td align="center" width="7%"><strong>1PM</strong>
</td>
<td align="center" width="7%"><strong>2PM</strong>
</td>
<td align="center" width="7%"><strong>3PM</strong>
</td>
<td align="center" width="7%"><strong>4PM</strong>
</td>
<td align="center" width="7%"><strong>5PM</strong>
</td>
<td align="center" width="7%"><strong>6PM</strong>
</td>
<td align="center" width="7%"><strong>7PM</strong>
</td>
<td align="center" width="7%"><strong>8PM</strong>
</td>
<td align="center" width="7%"><strong>9PM</strong>
</td>
<td align="center" width="7%"><strong>10PM</strong>
</td>
<td align="center" width="7%"><strong>11PM</strong>
</td>
</tr>
<tr>
<td align="center">
<strong>3-hr PSI</strong>
</td>
<td align="center">
54
</td>
<td align="center">
59
</td>
<td align="center">
65
</td>
<td align="center">
72
</td>
<td align="center">
79
</td>
<td align="center">
<strong style="font-size:14px;">82</strong>
</td>
<td align="center">
-
</td>
<td align="center">
-
</td>
<td align="center">
-
</td>
<td align="center">
-
</td>
<td align="center">
-
</td>
<td align="center">
-
</td>
</tr>
</table>
</div>
<div class="sfContentBlock">
<p class="table-caption">Hourly updates of 3-hr PSI readings are provided from 12am to 11:59pm. The 3hr PSI readings are calculated based on PM10 concentrations only</p>
</div>
<div>
</div>
<div class="backToTop">
Back to Top
</div>
</div>
</div>
<!-- end content -->
You should have shown what you've tried yourself, but here is the code:
from pprint import pprint
import urllib2
from bs4 import BeautifulSoup as soup
url = "http://app2.nea.gov.sg/anti-pollution-radiation-protection/air-pollution/psi/psi-readings-over-the-last-24-hours"
# Fetch the page and hand the response straight to BeautifulSoup.
web_soup = soup(urllib2.urlopen(url))
# The table used here is the first <table> inside the third <div> found
# within the div of class 'c1'.
container = web_soup.find(name="div", attrs={'class': 'c1'})
psi_table = container.find_all(name="div")[2].find_all('table')[0]
# One list of stripped cell texts per table row.
cell_rows = [
    [cell.text.strip() for cell in tr.find_all('td')]
    for tr in psi_table.find_all('tr')
]
readings = {}
# Rows alternate: each even-indexed row holds labels and the row right after
# it holds the matching values, column by column.
for label_idx in range(0, len(cell_rows), 2):
    for col, label in enumerate(cell_rows[label_idx]):
        readings[label] = cell_rows[label_idx + 1][col]
pprint(readings)
prints:
{'10AM': '49',
'10PM': '-',
'11AM': '52',
'11PM': '-',
'12AM': '76',
'12PM': '54',
'1AM': '70',
'1PM': '59',
'2AM': '64',
'2PM': '65',
'3AM': '59',
'3PM': '72',
'4AM': '54',
'4PM': '79',
'5AM': '51',
'5PM': '82',
'6AM': '48',
'6PM': '79',
'7AM': '47',
'7PM': '-',
'8AM': '47',
'8PM': '-',
'9AM': '47',
'9PM': '-',
'Time': '3-hr PSI'}

Categories