How to scrape a specific tr or td from a table in Python - python

I want to scrape the FirstName and the LastName from this website to use them in an automated browser input.
from lxml import html
import requests

page = requests.get('https://www.getnewidentity.com/uk-identity-generator.php')
tree = html.fromstring(page.content)

firstname = tree.xpath('//*[@id="reslist"]/tbody/tr[3]/td[2]/text()')
lastname = tree.xpath('//*[@id="reslist"]/tbody/tr[4]/td[2]/text()')

print('FirstName: ', firstname)
print('LastName: ', lastname)
input("close")
The website is https://www.getnewidentity.com/uk-identity-generator.php and its table markup looks like this:
<table class="table table-bordered table-striped" id="reslist"><thead><tr><th colspan="2" class="bg-primary">General Information</th></tr></thead><tbody><tr><td style="width:150px;">Name</td><td><b>Kamila Harmon</b></td></tr>
<tr><td>Gender</td><td>Female</td></tr>
<tr><td>First Name</td><td>Kamila</td></tr>
<tr><td>Last Name</td><td>Harmon</td></tr>
<tr><td>Birthday</td><td>12/26/1989</td></tr>

find_all() returns a collection of matching elements.
strip() is a built-in Python string method that removes leading and trailing whitespace.
Example:
from bs4 import BeautifulSoup
import requests

request = requests.post('https://www.getnewidentity.com/data/uk-identity-generator.php',
                        data={"num": "undefine", "add": "address", "unique": "true"})
soup = BeautifulSoup(request.content, 'lxml')
td = soup.find_all("td")

data = {}
for x in range(0, len(td) - 1, 2):
    data[td[x].text.strip()] = td[x + 1].text.strip()
print(data)
Output:
{'Name': 'Jayda Key', 'Gender': 'Female', 'First Name': 'Jayda', 'Last Name': 'Key',
'Birthday': '55', 'NINO': 'EB 29 38 84 B', 'Address': 'Flat 31l\nMartin Walk, Leoberg, S81
0HT', 'Street Address': 'Flat 31l\nMartin Walk', 'State': 'Leoberg', 'Zip Code': 'S81 0HT',
'Phone': '+44(0)9487 957056', 'Credit Card Type': 'MasterCard', 'Credit Card Number':
'5246585772859818', 'CVV': '899', 'Expires': '02/2022', 'Username': 'twinhero', 'Email':
'Gamestomper@gmail.com', 'Password': 'Go7ByznZ', 'User Agent': 'Mozilla/5.0 (Macintosh;
Intel Mac OS X 10_11_6) AppleWebKit/601.7.7 (KHTML, like Gecko) Version/9.1.2
Safari/601.7.7', 'Height': '1.85m (6.17ft)', 'Weight': '75.22kg (158.31pounds)',
'Blood type': 'O−'}

You say you want first name and last name; with bs4 4.7.1+ you can use :contains to target them appropriately. As already detailed in the other answer, the content is dynamically retrieved via a POST XHR.
from bs4 import BeautifulSoup as bs
import requests
r = requests.post('https://www.getnewidentity.com/data/uk-identity-generator.php',data={"num":"undefine","add":"address","unique":"true"})
soup = bs(r.content,'lxml')
first_name = soup.select_one('td:contains("First Name") + td').text
last_name = soup.select_one('td:contains("Last Name") + td').text
full_name = soup.select_one('td:contains("Name") + td').text
print(first_name, last_name, full_name)
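For completeness, the same fix can be sketched with lxml, the library used in the question. This is only a sketch that reuses the POST endpoint and payload shown above; the following-sibling XPath plays the role of the adjacent-sibling CSS selector:

from lxml import html
import requests

# same POST endpoint and payload as in the answers above
r = requests.post('https://www.getnewidentity.com/data/uk-identity-generator.php',
                  data={"num": "undefine", "add": "address", "unique": "true"})
tree = html.fromstring(r.content)

# pick the <td> that follows the label cell; xpath() returns a list of strings
first_name = tree.xpath('//td[text()="First Name"]/following-sibling::td/text()')
last_name = tree.xpath('//td[text()="Last Name"]/following-sibling::td/text()')
print(first_name, last_name)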

Related

Python - BeautifulSoup - How to target the nth child and print the text

I'm trying to scrape the "Biggest Gainers" list of coins on https://coinmarketcap.com/.
How do I access the nth child (Biggest Gainers) in the div with class_ = 'sc-1rmt1nr-0 sc-1rmt1nr-2 iMyvIy'?
I managed to get the data from the "Trending" section, but I am having trouble targeting the top 3 text items under "Biggest Gainers".
I get: AttributeError: 'NoneType' object has no attribute 'p'
from bs4 import BeautifulSoup
import requests

source = requests.get('https://coinmarketcap.com/').text
soup = BeautifulSoup(source, 'lxml')
section = soup.find(class_='sc-1rmt1nr-0 sc-1rmt1nr-2 iMyvIy')

# List the top 3 Gainers
for top_gainers in section.find_all(class_='sc-16r8icm-0 sc-1uagfi2-0 bdEGog sc-1rmt1nr-1 eCWTbV')[1]:
    top_gainers = top_gainers.find(class_='sc-1eb5slv-0 iworPT')
    top_coins = top_gainers.p.text
    print(top_coins)
I would avoid those dynamic classes and instead use :-soup-contains and combinators to first locate the desired block via its text, then use the combinators to specify the relationship of the final elements to extract info from.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

soup = bs(requests.get("https://coinmarketcap.com/").text, "lxml")

biggest_gainers = []
for i in soup.select(
    'div[color=text]:has(span:-soup-contains("Biggest Gainers")) > div ~ div'
):
    biggest_gainers.append(
        {
            "rank": int(i.select_one(".rank").text),
            "currency": i.select_one(".alias").text,
            "% change": f"{i.select_one('.icon-Caret-up').next_sibling}",
        }
    )

gainers = pd.DataFrame(biggest_gainers)
gainers
As mentioned by @QHarr, you should avoid dynamic identifiers. Similar to his approach, the selection works via :-soup-contains() and the known text of the element:
soup.select('div:has(>div>span:-soup-contains("Biggest Gainers")) ~ div')
To extract the texts I used stripped_strings and zipped it with the keys to a dict:
dict(zip(['rank','name','alias','change'],e.stripped_strings))
Example
from bs4 import BeautifulSoup
import requests

url = 'https://coinmarketcap.com/'
soup = BeautifulSoup(requests.get(url).content)

data = []
for e in soup.select('div:has(>div>span:-soup-contains("Biggest Gainers")) ~ div'):
    data.append(dict(zip(['rank', 'name', 'alias', 'change'], e.stripped_strings)))
Output
[{'rank': '1', 'name': 'Tenset', 'alias': '10SET', 'change': '1406.99'},
{'rank': '2', 'name': 'Burn To Earn', 'alias': 'BTE', 'change': '348.89'},
{'rank': '3', 'name': 'MetaCars', 'alias': 'MTC', 'change': '332.05'}]
You can use :nth-of-type to locate the "Biggest Gainers" parent div:
import requests
from bs4 import BeautifulSoup as soup
d = soup(requests.get('https://coinmarketcap.com/').text, 'html.parser')
bg = d.select_one('div:nth-of-type(2).sc-16r8icm-0.sc-1uagfi2-0.bdEGog.sc-1rmt1nr-1.eCWTbV')
data = [{'rank': i.select_one('span.rank').text,
         'name': i.select_one('p.sc-1eb5slv-0.iworPT').text,
         'change': i.select_one('span.sc-27sy12-0.gLZJFn').text}
        for i in bg.select('div.sc-1rmt1nr-0.sc-1rmt1nr-4.eQRTPY')]
Output:
[{'rank': '1', 'name': 'Tenset', 'change': '1308.72%'}, {'rank': '2', 'name': 'Burn To Earn', 'change': '421.82%'}, {'rank': '3', 'name': 'Aigang', 'change': '329.63%'}]

Python getting incomplete next page URL (BeautifulSoup, Request)

I am very new to Python and web scraping. I am scraping http://books.toscrape.com/index.html for a project, but I am stuck with the pagination logic. So far I managed to get every category, the book links and the information I needed within them, but I am struggling to scrape the next page URL for every category. The first problem is that the next page URL is incomplete (but that I can manage); the second problem is that the base URL I have to use changes for every category.
Here is my code:
import requests
from bs4 import BeautifulSoup

project = []
url = 'http://books.toscrape.com'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")

links = []
categories = soup.findAll("ul", class_="nav nav-list")
for category in categories:
    hrefs = category.find_all('a', href=True)
    for href in hrefs:
        links.append(href['href'])

new_links = [element.replace("catalogue", "http://books.toscrape.com/catalogue") for element in links]
del new_links[0]

page = 0
books = []
for link in new_links:
    r2 = requests.get(link).text
    book_soup = BeautifulSoup(r2, "html.parser")
    print("category: " + link)
    nextpage = True
    while nextpage:
        book_link = book_soup.find_all(class_="product_pod")
        for product in book_link:
            a = product.find('a')
            full_link = a['href'].replace("../../..", "")
            print("book: " + full_link)
            books.append("http://books.toscrape.com/catalogue" + full_link)
        if book_soup.find('li', class_='next') is None:
            nextpage = False
            page += 1
            print("end of pagination")
        else:
            next_page = book_soup.select_one('li.next>a')
            print(next_page)
The part I am struggling with is the while loop inside "for link in new_links".
I am mostly looking for any example that can help me. Thank you!
If you do not want to scrape the links via http://books.toscrape.com/index.html directly while paging through all the results, you could reach your goal like this:
from bs4 import BeautifulSoup
import requests

base_url = 'http://books.toscrape.com/'
soup = BeautifulSoup(requests.get(base_url).text)

books = []
for cat in soup.select('.nav-list ul a'):
    cat_url = base_url + cat.get('href').rsplit('/', 1)[0]
    url = cat_url
    while True:
        soup = BeautifulSoup(requests.get(url).text)
        ##print(url)
        books.extend(['http://books.toscrape.com/catalogue/' + a.get('href').strip('../../../') for a in soup.select('article h3 a')])
        if soup.select_one('li.next a'):
            url = f"{cat_url}/{soup.select_one('li.next a').get('href')}"
        else:
            break
books
Because the result would be the same, I would recommend skipping the detour over the categories:
from bs4 import BeautifulSoup
import requests

base_url = 'http://books.toscrape.com/'
url = 'https://books.toscrape.com/catalogue/page-1.html'

books = []
while True:
    soup = BeautifulSoup(requests.get(url).text)
    for a in soup.select('article h3 a'):
        bsoup = BeautifulSoup(requests.get(base_url + 'catalogue/' + a.get('href')).content)
        print(base_url + 'catalogue/' + a.get('href'))
        data = {
            'title': bsoup.h1.text.strip(),
            'category': bsoup.select('.breadcrumb li')[-2].text.strip(),
            'url': base_url + 'catalogue/' + a.get('href')
            ### add whatever is needed
        }
        data.update(dict(row.stripped_strings for row in bsoup.select('table tr')))
        books.append(data)
    if soup.select_one('li.next a'):
        url = f"{url.rsplit('/',1)[0]}/{soup.select_one('li.next a').get('href')}"
    else:
        break
books
Output
[{'title': 'A Light in the Attic',
'category': 'Poetry',
'url': 'http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
'UPC': 'a897fe39b1053632',
'Product Type': 'Books',
'Price (excl. tax)': '£51.77',
'Price (incl. tax)': '£51.77',
'Tax': '£0.00',
'Availability': 'In stock (22 available)',
'Number of reviews': '0'},
{'title': 'Tipping the Velvet',
'category': 'Historical Fiction',
'url': 'http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html',
'UPC': '90fa61229261140a',
'Product Type': 'Books',
'Price (excl. tax)': '£53.74',
'Price (incl. tax)': '£53.74',
'Tax': '£0.00',
'Availability': 'In stock (20 available)',
'Number of reviews': '0'},
{'title': 'Soumission',
'category': 'Fiction',
'url': 'http://books.toscrape.com/catalogue/soumission_998/index.html',
'UPC': '6957f44c3847a760',
'Product Type': 'Books',
'Price (excl. tax)': '£50.10',
'Price (incl. tax)': '£50.10',
'Tax': '£0.00',
'Availability': 'In stock (20 available)',
'Number of reviews': '0'},...]

BeautifulSoup extracting multiple tables

I am trying to extract some data from two tables in the same HTML with BeautifulSoup. Actually, I have already extracted part of both tables, but not all of them. This is the code that I have:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html_content = urlopen('https://www.icewarehouse.com/Bauer_Vapor_X25_Ice_Hockey_Skates/descpage-V25XS.html')
soup = BeautifulSoup(html_content, "lxml")
tables = soup.find_all('table', attrs={'class' : 'orderingtable fl'})

for table_skates in tables:
    t_headers = []
    t_data = []
    t_row = {}
    for tr in table_skates.find_all('th'):
        t_headers.append(tr.text.replace('\n', '').strip())
    for td in table_skates.find_all('td'):
        t_data.append(td.text.replace('\n', '').strip())
    t_row = dict(zip(t_headers, t_data))
    print(t_row)
Here is the output that I get:
{'Size': '1.0', 'Price': '$109.99', 'Stock': '1', 'Qty': ''}
{'Size': '7.0', 'Price': '$159.99', 'Stock': '2+', 'Qty': ''}
You can easily get it by using read_html in pandas:
import pandas as pd

dfs = pd.read_html(html_content, attrs={'class': 'orderingtable fl'})  # one DataFrame per matching table
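A short, self-contained usage sketch (assuming the page still serves both sizing tables with that class; pd.read_html returns one DataFrame per matching <table>):

import pandas as pd
import requests

url = 'https://www.icewarehouse.com/Bauer_Vapor_X25_Ice_Hockey_Skates/descpage-V25XS.html'
# one DataFrame per <table class="orderingtable fl"> on the page
dfs = pd.read_html(requests.get(url).text, attrs={'class': 'orderingtable fl'})
for df in dfs:
    print(df.head())  # columns seen in the question's output: Size, Price, Stock, Qty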

Can't parse customized results without using requests within scrapy

I've created a script using scrapy to fetch all the links connected to the names of different actors from imdb.com, then parse the first three of their movie links, and finally scrape the names of the director and writer of those movies. My script does it flawlessly if I stick to the current attempt. However, I've used the requests module (which I don't want to) within the parse_results method to get the customized output.
website address
What the script does (consider the first named link, as in Robert De Niro):
The script uses the above url and scrapes the named link to parse the first three movie links from here, located under the title Filmography.
Then it parses the names of the directors and writers from here.
This is what I've written so far (the working version):
import scrapy
import requests
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess

class ImdbSpider(scrapy.Spider):
    name = 'imdb'
    start_urls = ['https://www.imdb.com/list/ls058011111/']

    def parse(self, response):
        soup = BeautifulSoup(response.text, "lxml")
        for name_links in soup.select(".mode-detail")[:10]:
            name = name_links.select_one("h3 > a").get_text(strip=True)
            item_link = response.urljoin(name_links.select_one("h3 > a").get("href"))
            yield scrapy.Request(item_link, meta={"name": name}, callback=self.parse_items)

    def parse_items(self, response):
        name = response.meta.get("name")
        soup = BeautifulSoup(response.text, "lxml")
        item_links = [response.urljoin(item.get("href")) for item in soup.select(".filmo-category-section .filmo-row > b > a[href]")[:3]]
        result_list = [i for url in item_links for i in self.parse_results(url)]
        yield {"actor name": name, "associated name list": result_list}

    def parse_results(self, link):
        response = requests.get(link)
        soup = BeautifulSoup(response.text, "lxml")
        try:
            director = soup.select_one("h4:contains('Director') ~ a").get_text(strip=True)
        except Exception as e:
            director = ""
        try:
            writer = soup.select_one("h4:contains('Writer') ~ a").get_text(strip=True)
        except Exception as e:
            writer = ""
        return director, writer

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
})
c.crawl(ImdbSpider)
c.start()
Output the above script produces (the desired results):
{'actor name': 'Robert De Niro', 'associated name list': ['Jonathan Jakubowicz', 'Jonathan Jakubowicz', '', 'Anthony Thorne', 'Martin Scorsese', 'David Grann']}
{'actor name': 'Sidney Poitier', 'associated name list': ['Gregg Champion', 'Richard Leder', 'Gregg Champion', 'Sterling Anderson', 'Lloyd Kramer', 'Theodore Isaac Rubin']}
{'actor name': 'Daniel Day-Lewis', 'associated name list': ['Paul Thomas Anderson', 'Paul Thomas Anderson', 'Paul Thomas Anderson', 'Paul Thomas Anderson', 'Steven Spielberg', 'Tony Kushner']}
{'actor name': 'Humphrey Bogart', 'associated name list': ['', '', 'Mark Robson', 'Philip Yordan', 'William Wyler', 'Joseph Hayes']}
{'actor name': 'Gregory Peck', 'associated name list': ['', '', 'Arthur Penn', 'Tina Howe', 'Walter C. Miller', 'Peter Stone']}
{'actor name': 'Denzel Washington', 'associated name list': ['Joel Coen', 'Joel Coen', 'John Lee Hancock', 'John Lee Hancock', 'Antoine Fuqua', 'Richard Wenk']}
In the above approach I used the requests module within the parse_results method to get the desired output, as I can't use yield within a list comprehension.
How can I let the script produce the exact same output without using requests?
One way you can address this is by using Request.meta to keep a list of pending URLs for an item across requests, and pop URLs from it.
As @pguardiario mentions, the drawback is that you are still only processing one request from that list at a time. However, if you have more items than the configured concurrency, that should not be a problem.
This approach would look like this:
def parse_items(self, response):
    # …
    if item_links:
        meta = {
            "actor name": name,
            "associated name list": [],
            "item_links": item_links,
        }
        yield Request(
            item_links.pop(),
            callback=self.parse_results,
            meta=meta
        )
    else:
        yield {"actor name": name}

def parse_results(self, response):
    # …
    response.meta["associated name list"].append((director, writer))
    if response.meta["item_links"]:
        yield Request(
            response.meta["item_links"].pop(),
            callback=self.parse_results,
            meta=response.meta
        )
    else:
        yield {
            "actor name": response.meta["actor name"],
            "associated name list": response.meta["associated name list"],
        }
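For clarity, here is a sketch of how the skeleton above might be merged with the selectors from the question. It is an assumption of the final shape rather than a tested spider; per the skeleton, the name list ends up as a list of (director, writer) tuples:

import scrapy
from scrapy import Request
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup

class ImdbSpider(scrapy.Spider):
    name = 'imdb'
    start_urls = ['https://www.imdb.com/list/ls058011111/']

    def parse(self, response):
        soup = BeautifulSoup(response.text, "lxml")
        for name_links in soup.select(".mode-detail")[:10]:
            name = name_links.select_one("h3 > a").get_text(strip=True)
            item_link = response.urljoin(name_links.select_one("h3 > a").get("href"))
            yield Request(item_link, meta={"name": name}, callback=self.parse_items)

    def parse_items(self, response):
        name = response.meta.get("name")
        soup = BeautifulSoup(response.text, "lxml")
        item_links = [response.urljoin(item.get("href"))
                      for item in soup.select(".filmo-category-section .filmo-row > b > a[href]")[:3]]
        if item_links:
            # carry the remaining movie URLs along in meta and pop them one by one
            meta = {"actor name": name, "associated name list": [], "item_links": item_links}
            yield Request(item_links.pop(), callback=self.parse_results, meta=meta)
        else:
            yield {"actor name": name}

    def parse_results(self, response):
        soup = BeautifulSoup(response.text, "lxml")
        director = soup.select_one("h4:contains('Director') ~ a")
        writer = soup.select_one("h4:contains('Writer') ~ a")
        response.meta["associated name list"].append(
            (director.get_text(strip=True) if director else "",
             writer.get_text(strip=True) if writer else ""))
        if response.meta["item_links"]:
            yield Request(response.meta["item_links"].pop(),
                          callback=self.parse_results, meta=response.meta)
        else:
            yield {"actor name": response.meta["actor name"],
                   "associated name list": response.meta["associated name list"]}

c = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
c.crawl(ImdbSpider)
c.start()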

python BeautifulSoup parsing table

I'm learning python requests and BeautifulSoup. For an exercise, I've chosen to write a quick NYC parking ticket parser. I am able to get an html response which is quite ugly. I need to grab the lineItemsTable and parse all the tickets.
You can reproduce the page by going here: https://paydirect.link2gov.com/NYCParking-Plate/ItemSearch and entering a NY plate T630134C
soup = BeautifulSoup(plateRequest.text)
#print(soup.prettify())
#print soup.find_all('tr')

table = soup.find("table", { "class" : "lineItemsTable" })
for row in table.findAll("tr"):
    cells = row.findAll("td")
    print cells
Can someone please help me out? Simply looking for all the tr elements does not get me anywhere.
Here you go:
data = []
table = soup.find('table', attrs={'class':'lineItemsTable'})
table_body = table.find('tbody')

rows = table_body.find_all('tr')
for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append([ele for ele in cols if ele])  # Get rid of empty values
This gives you:
[ [u'1359711259', u'SRF', u'08/05/2013', u'5310 4 AVE', u'K', u'19', u'125.00', u'$'],
[u'7086775850', u'PAS', u'12/14/2013', u'3908 6th Ave', u'K', u'40', u'125.00', u'$'],
[u'7355010165', u'OMT', u'12/14/2013', u'3908 6th Ave', u'K', u'40', u'145.00', u'$'],
[u'4002488755', u'OMT', u'02/12/2014', u'NB 1ST AVE # E 23RD ST', u'5', u'115.00', u'$'],
[u'7913806837', u'OMT', u'03/03/2014', u'5015 4th Ave', u'K', u'46', u'115.00', u'$'],
[u'5080015366', u'OMT', u'03/10/2014', u'EB 65TH ST # 16TH AV E', u'7', u'50.00', u'$'],
[u'7208770670', u'OMT', u'04/08/2014', u'333 15th St', u'K', u'70', u'65.00', u'$'],
[u'$0.00\n\n\nPayment Amount:']
]
A couple of things to note:
The last row in the output above (the Payment Amount) is not part of the table, but that is how the table is laid out. You can filter it out by checking whether the length of the list is less than 7 (a minimal sketch follows below).
The last column of every row will have to be handled separately since it is an input text box.
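A minimal sketch of that length check, assuming data is the list built above:

# keep only real ticket rows; the trailing "Payment Amount:" row has fewer than 7 cells
tickets = [row for row in data if len(row) >= 7]
print(len(tickets), "tickets")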
Updated Answer
If a programmer is interested in only parsing a table from a webpage, they can utilize the pandas method pandas.read_html.
Let's say we want to extract the GDP data table from the website https://worldpopulationreview.com/countries/countries-by-gdp/#worldCountries.
Then the following code does the job perfectly (no need for BeautifulSoup and fancy HTML parsing):
Using pandas only
# sometimes we can directly read from the website
import pandas as pd

url = "https://en.wikipedia.org/wiki/AFI%27s_100_Years...100_Movies#:~:text=%20%20%20%20Film%20%20%20,%20%204%20%2025%20more%20rows%20"
df = pd.read_html(url)[0]  # read_html returns a list of DataFrames; take the first table
df.head()
Using pandas and requests (More General Case)
# if pd.read_html(url) does not work directly, fetch the page with requests first
import pandas as pd
import requests
url = "https://worldpopulationreview.com/countries/countries-by-gdp/#worldCountries"
r = requests.get(url)
df_list = pd.read_html(r.text) # this parses all the tables in webpages to a list
df = df_list[0]
df.head()
Required modules
pip install lxml
pip install requests
pip install pandas
Solved, this is how you parse their HTML results:
table = soup.find("table", { "class" : "lineItemsTable" })
for row in table.findAll("tr"):
cells = row.findAll("td")
if len(cells) == 9:
summons = cells[1].find(text=True)
plateType = cells[2].find(text=True)
vDate = cells[3].find(text=True)
location = cells[4].find(text=True)
borough = cells[5].find(text=True)
vCode = cells[6].find(text=True)
amount = cells[7].find(text=True)
print amount
Here is a working example for a generic <table> (the links in the question are broken).
Extracting the table of countries by GDP (Gross Domestic Product) from here.
htmltable = soup.find('table', { 'class' : 'table table-striped' })
# where the dictionary specify unique attributes for the 'table' tag
The tableDataText function parses an HTML segment starting with the <table> tag, followed by multiple <tr> (table row) and inner <td> (table data) tags. It returns a list of rows with their inner columns. It accepts <th> (table header) cells only in the first row.
def tableDataText(table):
    rows = []
    trs = table.find_all('tr')
    headerow = [td.get_text(strip=True) for td in trs[0].find_all('th')]  # header row
    if headerow:  # if there is a header row, include it first
        rows.append(headerow)
        trs = trs[1:]
    for tr in trs:  # for every table row
        rows.append([td.get_text(strip=True) for td in tr.find_all('td')])  # data row
    return rows
Using it we get (the first two rows):
list_table = tableDataText(htmltable)
list_table[:2]
[['Rank',
'Name',
"GDP (IMF '19)",
"GDP (UN '16)",
'GDP Per Capita',
'2019 Population'],
['1',
'United States',
'21.41 trillion',
'18.62 trillion',
'$65,064',
'329,064,917']]
That can be easily transformed into a pandas.DataFrame for more advanced tools.
import pandas as pd
dftable = pd.DataFrame(list_table[1:], columns=list_table[0])
dftable.head(4)
I was interested in the tables in the MediaWiki version display, such as https://en.wikipedia.org/wiki/Special:Version
unit test
from unittest import TestCase
import pprint

class TestHtmlTables(TestCase):
    '''
    test the HTML tables parser
    '''

    def testHtmlTables(self):
        url = "https://en.wikipedia.org/wiki/Special:Version"
        html_table = HtmlTable(url)
        tables = html_table.get_tables("h2")
        pp = pprint.PrettyPrinter(indent=2)
        debug = True
        if debug:
            pp.pprint(tables)
        pass
HtmlTable.py
'''
Created on 2022-10-25

@author: wf
'''
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

class HtmlTable(object):
    '''
    HtmlTable
    '''

    def __init__(self, url):
        '''
        Constructor
        '''
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        self.html_page = urlopen(req).read()
        self.soup = BeautifulSoup(self.html_page, 'html.parser')

    def get_tables(self, header_tag: str = None) -> dict:
        """
        get all tables from my soup as a list of list of dicts

        Args:
            header_tag(str): if set, search the table name from the given header tag

        Return:
            dict: the list of list of dicts for all tables
        """
        tables = {}
        for i, table in enumerate(self.soup.find_all("table")):
            fields = []
            table_data = []
            for tr in table.find_all('tr', recursive=True):
                for th in tr.find_all('th', recursive=True):
                    fields.append(th.text)
            for tr in table.find_all('tr', recursive=True):
                record = {}
                # use j for the cell index to avoid shadowing the table index i
                for j, td in enumerate(tr.find_all('td', recursive=True)):
                    record[fields[j]] = td.text
                if record:
                    table_data.append(record)
            if header_tag is not None:
                header = table.find_previous_sibling(header_tag)
                table_name = header.text
            else:
                table_name = f"table{i}"
            tables[table_name] = table_data
        return tables
Result
Finding files... done.
Importing test modules ... done.
Tests to run: ['TestHtmlTables.testHtmlTables']
testHtmlTables (tests.test_html_table.TestHtmlTables) ... Starting test testHtmlTables, debug=False ...
{ 'Entry point URLs': [ {'Entry point': 'Article path', 'URL': '/wiki/$1'},
{'Entry point': 'Script path', 'URL': '/w'},
{'Entry point': 'index.php', 'URL': '/w/index.php'},
{'Entry point': 'api.php', 'URL': '/w/api.php'},
{'Entry point': 'rest.php', 'URL': '/w/rest.php'}],
'Installed extensions': [ { 'Description': 'Brad Jorsch',
'Extension': '1.0 (b9a7bff) 01:45, 9 October '
'2022',
'License': 'Get a summary of logged API feature '
'usages for a user agent',
'Special pages': 'ApiFeatureUsage',
'Version': 'GPL-2.0-or-later'},
{ 'Description': 'Brion Vibber, Kunal Mehta, Sam '
'Reed, Aaron Schulz, Brad Jorsch, '
'Umherirrender, Marius Hoch, '
'Andrew Garrett, Chris Steipp, '
'Tim Starling, Gergő Tisza, '
'Alexandre Emsenhuber, Victor '
'Vasiliev, Glaisher, DannyS712, '
'Peter Gehres, Bryan Davis, James '
'D. Forrester, Taavi Väänänen and '
'Alexander Vorwerk',
'Extension': '– (df2982e) 23:10, 13 October 2022',
'License': 'Merge account across wikis of the '
'Wikimedia Foundation',
'Special pages': 'CentralAuth',
'Version': 'GPL-2.0-or-later'},
{ 'Description': 'Tim Starling and Aaron Schulz',
'Extension': '2.5 (648cfe0) 06:20, 17 October '
'2022',
'License': 'Grants users with the appropriate '
'permission the ability to check '
"users' IP addresses and other "
'information',
'Special pages': 'CheckUser',
'Version': 'GPL-2.0-or-later'},
{ 'Description': 'Ævar Arnfjörð Bjarmason and '
'James D. Forrester',
'Extension': '– (2cf4aaa) 06:41, 14 October 2022',
'License': 'Adds a citation special page and '
'toolbox link',
'Special pages': 'CiteThisPage',
'Version': 'GPL-2.0-or-later'},
{ 'Description': 'PediaPress GmbH, Siebrand '
'Mazeland and Marcin Cieślak',
'Extension': '1.8.0 (324e738) 06:20, 17 October '
'2022',
'License': 'Create books',
'Special pages': 'Collection',
'Version': 'GPL-2.0-or-later'},
{ 'Description': 'Amir Aharoni, David Chan, Joel '
'Sahleen, Kartik Mistry, Niklas '
'Laxström, Pau Giner, Petar '
'Petković, Runa Bhattacharjee, '
'Santhosh Thottingal, Siebrand '
'Mazeland, Sucheta Ghoshal and '
'others',
'Extension': '– (56fe095) 11:56, 17 October 2022',
'License': 'Makes it easy to translate content '
'pages',
'Special pages': 'ContentTranslation',
'Version': 'GPL-2.0-or-later'},
{ 'Description': 'Andrew Garrett, Ryan Kaldari, '
'Benny Situ, Luke Welling, Kunal '
'Mehta, Moriel Schottlender, Jon '
'Robson and Roan Kattouw',
'Extension': '– (cd01f9b) 06:21, 17 October 2022',
'License': 'System for notifying users about '
'events and messages',
'Special pages': 'Echo',
'Version': 'MIT'},
..
'Installed libraries': [ { 'Authors': 'Benjamin Eberlei and Richard Quadling',
'Description': 'Thin assertion library for input '
'validation in business models.',
'Library': 'beberlei/assert',
'License': 'BSD-2-Clause',
'Version': '3.3.2'},
{ 'Authors': '',
'Description': 'Arbitrary-precision arithmetic '
'library',
'Library': 'brick/math',
'License': 'MIT',
'Version': '0.8.17'},
{ 'Authors': 'Christian Riesen',
'Description': 'Base32 encoder/decoder according '
'to RFC 4648',
'Library': 'christian-riesen/base32',
'License': 'MIT',
'Version': '1.6.0'},
...
{ 'Authors': 'Readers Web Team, Trevor Parscal, Roan '
'Kattouw, Alex Hollender, Bernard Wang, '
'Clare Ming, Jan Drewniak, Jon Robson, '
'Nick Ray, Sam Smith, Stephen Niedzielski '
'and Volker E.',
'Description': 'Provides 2 Vector skins:\n'
'\n'
'2011 - The Modern version of MonoBook '
'with fresh look and many usability '
'improvements.\n'
'2022 - The Vector built as part of '
'the WMF mw:Desktop Improvements '
'project.',
'License': 'GPL-2.0-or-later',
'Skin': 'Vector',
'Version': '1.0.0 (93f11b3) 20:24, 17 October 2022'}],
'Installed software': [ { 'Product': 'MediaWiki',
'Version': '1.40.0-wmf.6 (bb4c5db)17:39, 17 '
'October 2022'},
{'Product': 'PHP', 'Version': '7.4.30 (fpm-fcgi)'},
{ 'Product': 'MariaDB',
'Version': '10.4.25-MariaDB-log'},
{'Product': 'ICU', 'Version': '63.1'},
{'Product': 'Pygments', 'Version': '2.10.0'},
{'Product': 'LilyPond', 'Version': '2.22.0'},
{'Product': 'Elasticsearch', 'Version': '7.10.2'},
{'Product': 'LuaSandbox', 'Version': '4.0.2'},
{'Product': 'Lua', 'Version': '5.1.5'}]}
test testHtmlTables, debug=False took 1.2 s
ok
----------------------------------------------------------------------
Ran 1 test in 1.204s
OK
from behave import *
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate

class readTableDataFromDB:
    def LookupValueFromColumnSingleKey(context, tablexpath, rowName, columnName):
        print("element present readData From Table")
        element = context.driver.find_elements_by_xpath(tablexpath + "/descendant::th")
        indexrow = 1
        indexcolumn = 1
        for values in element:
            valuepresent = values.text
            print("text present here::" + valuepresent + "rowName::" + rowName)
            if valuepresent.find(columnName) != -1:
                print("current row" + str(indexrow) + "value" + valuepresent)
                break
            else:
                indexrow = indexrow + 1

        indexvalue = context.driver.find_elements_by_xpath(
            tablexpath + "/descendant::tr/td[1]")
        for valuescolumn in indexvalue:
            valuepresentcolumn = valuescolumn.text
            print("Team text present here::" +
                  valuepresentcolumn + "columnName::" + rowName)
            print(indexcolumn)
            if valuepresentcolumn.find(rowName) != -1:
                print("current column" + str(indexcolumn) +
                      "value" + valuepresentcolumn)
                break
            else:
                indexcolumn = indexcolumn + 1

        print("index column" + str(indexcolumn))
        print(tablexpath + "//descendant::tr[" + str(indexcolumn) + "]/td[" + str(indexrow) + "]")
        # lookupelement = context.driver.find_element_by_xpath(tablexpath + "//descendant::tr[" + str(indexcolumn) + "]/td[" + str(indexrow) + "]")
        # print(lookupelement.text)
        return context.driver.find_elements_by_xpath(tablexpath + "//descendant::tr[" + str(indexcolumn) + "]/td[" + str(indexrow) + "]")

    def LookupValueFromColumnTwoKeyssss(context, tablexpath, rowName, columnName, columnName1):
        print("element present readData From Table")
        element = context.driver.find_elements_by_xpath(
            tablexpath + "/descendant::th")
        indexrow = 1
        indexcolumn = 1
        indexcolumn1 = 1
        for values in element:
            valuepresent = values.text
            print("text present here::" + valuepresent)
            indexrow = indexrow + 1
            if valuepresent == columnName:
                print("current row value" + str(indexrow) + "value" + valuepresent)
                break
        for values in element:
            valuepresent = values.text
            print("text present here::" + valuepresent)
            indexrow = indexrow + 1
            if valuepresent.find(columnName1) != -1:
                print("current row value" + str(indexrow) + "value" + valuepresent)
                break
        indexvalue = context.driver.find_elements_by_xpath(
            tablexpath + "/descendant::tr/td[1]")
        for valuescolumn in indexvalue:
            valuepresentcolumn = valuescolumn.text
            print("Team text present here::" + valuepresentcolumn)
            print(indexcolumn)
            indexcolumn = indexcolumn + 1
            if valuepresent.find(rowName) != -1:
                print("current column" + str(indexcolumn) +
                      "value" + valuepresentcolumn)
                break
        print("indexrow" + str(indexrow))
        print("index column" + str(indexcolumn))
        lookupelement = context.driver.find_element_by_xpath(
            tablexpath + "//descendant::tr[" + str(indexcolumn) + "]/td[" + str(indexrow) + "]")
        print(tablexpath +
              "//descendant::tr[" + str(indexcolumn) + "]/td[" + str(indexrow) + "]")
        print(lookupelement.text)
        return context.driver.find_element_by_xpath(tablexpath + "//descendant::tr[" + str(indexrow) + "]/td[" + str(indexcolumn) + "]")
