How can I get a page in a language other than the default?
My code:
import requests
from bs4 import BeautifulSoup

def scrape(page):
    url = page
    result = requests.get(url, stream=True)
    if result.status_code == 200:
        # reuse the response we already have instead of fetching the page a second time
        soup = BeautifulSoup(result.content, 'html.parser')
        return soup

product = scrape("https://www.tatechnix.de/tatechnix/gx/product_info.php?info=p44232")
print(product)
When I open this page I get the German (DE) version. How can I get the English (EN) one?
The site doesn't have a separate URL or prefix for the language; it only changes via a button.
Edit:
import requests
from bs4 import BeautifulSoup

def scrape(page):
    headers = {'Accept-Language': 'en-US,en;q=0.8'}
    url = page
    result = requests.get(url, headers=headers)
    if result.status_code == 200:
        soup = BeautifulSoup(result.content, 'html.parser')
        title = soup.select('.product-info-title-desktop')[0].text
        return title

product = scrape("https://www.tatechnix.de/tatechnix/gx/product_info.php?info=p44232")
print(product)
Nothing changed :/
Try it, appending the language as a query parameter (&language=en):
import requests
from bs4 import BeautifulSoup

def scrape(page):
    url = page
    result = requests.get(url)
    if result.status_code == 200:
        soup = BeautifulSoup(result.content, 'html.parser')
        title = soup.select('.product-info-title-desktop')[0].text
        return title

product = scrape("https://www.tatechnix.de/tatechnix/gx/product_info.php?info=p44232&language=en")
print(product)
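As a quick sanity check (a sketch, assuming the shop honors the parameter on every product page), you can fetch both variants and compare the titles:

import requests
from bs4 import BeautifulSoup

base = "https://www.tatechnix.de/tatechnix/gx/product_info.php?info=p44232"

def get_title(url):
    # product title selector taken from the thread above
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    return soup.select('.product-info-title-desktop')[0].text

print(get_title(base))                   # default (German) title
print(get_title(base + '&language=en'))  # English title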
Related
I am trying to write a scraper, but I have run into an issue.
I can parse a class in a span and a class in a div, but when I try to parse an id in a span it doesn't print the data I want.
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

req = Request('https://bscscan.com/token/0xc3d33bdd0b6cea10eb496fbc7592e45f2624c0a5', headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
soup = BeautifulSoup(webpage, 'html.parser')

name = soup.find('span', class_='text-secondary small').text
add = soup.find('div', class_='mr-3').text
trans = soup.find('span', attrs={'id':'totaltxns'}).text
print(name, add, trans)
The totaltxns value is filled in dynamically, so it isn't in the HTML you are parsing. You need to pick up a session cookie, then make a request to an additional endpoint; the sid needs to be picked up dynamically as well.
import requests, re

def get_transfer_count(token: str) -> str:
    with requests.Session() as s:
        s.headers = {'User-Agent': 'Mozilla/5.0'}
        # the first request sets the session cookie and exposes the sid
        r = s.get(f'https://bscscan.com/token/{token}')
        sid = re.search(r"var sid = '(.*?)'", r.text).group(1)
        # the transfer count is served by this secondary endpoint
        r = s.get(f'https://bscscan.com/token/generic-tokentxns2?m=normal&contractAddress={token}&a=&sid={sid}&p=1')
        return re.search(r"var totaltxns = '(.*?)'", r.text).group(1)

token = '0x8df9655178350146eAD837B15ba3D32c9Fe1497d'
print(get_transfer_count(token))
How do I extract the value underlined in red (in my screenshot) and save it as a list?
I want to extract the memCode value from the href of the a tag inside the p tag using soup, but I don't know how to extract it at all.
Please help me.
My code:
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")

for href in soup.find("div", class_="memList memList-col-3").find_all("a"):
    print(href)
Try this, using a CSS selector:
import requests
from bs4 import BeautifulSoup

resp = requests.get('https://www.gjcouncil.go.kr/kr/member/name.do')
soup = BeautifulSoup(resp.text, "html.parser")

for a in soup.select("div[id='member_list'] > ul > li > a"):
    print(a['href'].split("/")[2])
08070
00716
08040
....
....
You can use split on the "=" and take the -1 index. I also changed the class.
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")

ids = [i['href'].split('=')[-1] for i in soup.select('.btn-home')]
print(ids)
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")

href_list = []
for href in soup.find("div", class_="memList memList-col-3").find_all("a"):
    if href['href'] != '#LINK':
        href_list.append(href['href'][-7:])
print(href_list)
['7620212', '7670126', '7670420', '7650601', '7890930', '7800407', '7660925', '7641102', '7731222', '7801011', '7570803', '7770106', '7590808', '7700831', '7580115', '7710713', '7680112', '7621125', '7711117', '7680213', '7640925', '7591214']
One of the best methods is to use a regular expression.
Check out this code:
import urllib.request
from bs4 import BeautifulSoup
import re

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")

list_ = []
for href in soup.find("div", class_="memList memList-col-3").find_all("a"):
    list_.append(href['href'])

regobj = re.compile(r'memCode=(\w+)')
final = list(filter('#LINK'.__ne__, list_))
result = list(map(lambda i: regobj.search(i).group(1), final))
print(result)
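A one-pass variant (a sketch, assuming the page is UTF-8 encoded; note that re.findall scans the whole page, not just the member list, so deduplicate or scope it if memCode appears elsewhere):

import re
import urllib.request

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
html = urllib.request.urlopen(url).read().decode('utf-8')

# every memCode value anywhere in the page, in document order
codes = re.findall(r'memCode=(\w+)', html)
print(codes)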
import requests
from bs4 import BeautifulSoup

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('Server Responded: ', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')
        return soup

def get_detail_data(soup):
    # price
    # item
    h1 = soup.find('h1', id='itemTitle')
    print(h1)

def main():
    url = "https://www.ebay.com/itm/New-Longines-Master-Collection-Automatic-40mm-White-Mens-Watch-L2-909-4-78-3/383525040495?hash=item594bdfb16f:g:vdIAAOSwytheqbKu"
    get_detail_data(get_page(url))

if __name__ == '__main__':
    main()
Hi, please help me with how I can select the item name on eBay. The item name is the title of the watch. I managed to get to the h1 and then to itemTitle.
Example
import requests
from bs4 import BeautifulSoup

def get_page(url):
    response = requests.get(url=url)
    if not response.ok:
        print('Server Responded: ', response.status_code)
    else:
        soup = BeautifulSoup(response.text, features='html.parser')
        return soup

def get_detail_data(soup):
    # the title starts with a visually hidden "Details about" span (g-hdn);
    # the plain title text is the node right after it
    h1 = soup.select("span.g-hdn")[0]
    print(h1.next_sibling)
    return h1

if __name__ == "__main__":
    url = "https://www.ebay.com/itm/New-Longines-Master-Collection-Automatic-40mm-White-Mens-Watch-L2-909-4-78-3/383525040495?hash=item594bdfb16f:g:vdIAAOSwytheqbKu"
    get_detail_data(get_page(url))
Prints out
New Longines Master Collection Automatic 40mm White Men's Watch L2.909.4.78.3
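If you'd rather keep the h1#itemTitle lookup from the question, here is a sketch that removes the hidden span instead (assuming span.g-hdn sits inside that h1, as it appears to in this listing's markup):

def get_detail_data(soup):
    h1 = soup.find('h1', id='itemTitle')
    # drop the visually hidden "Details about" label before reading the text
    h1.find('span', class_='g-hdn').decompose()
    return h1.get_text(strip=True)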
I want to scrape the titles of the PDFs on this website. However, I get the titles together with the link text. How can I fix this?
import requests
from bs4 import BeautifulSoup
import numpy as np

publications = []
text = []
for i in np.arange(12, 19):
    response = requests.get('https://occ.ca/our-publications/page/{}/'.format(i), headers={'User-Agent': 'Mozilla'})
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'lxml')
        pdfs = soup.findAll('div', {"class": "publicationoverlay"})
        links = [pdf.find('a').attrs['href'] for pdf in pdfs]
        publications.extend(links)
        text.extend(pdfs)
Any help would be much appreciated.
You want the .text, but split on \t (to exclude the child a text) and strip. I use Session for efficiency.
import requests
from bs4 import BeautifulSoup
import numpy as np

text = []
with requests.Session() as s:
    for i in np.arange(12, 19):
        response = s.get('https://occ.ca/our-publications/page/{}/'.format(i), headers={'User-Agent': 'Mozilla'})
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'lxml')
            pdfs = soup.findAll('div', {"class": "publicationoverlay"})
            # the title is the text before the first tab; the child links' text follows it
            text.extend([pdf.text.strip().split('\t')[0] for pdf in pdfs])
You could also use decompose to remove the child a tags after grabbing each href and before taking the .text of the parent:
import requests
from bs4 import BeautifulSoup
import numpy as np

text = []
links = []
with requests.Session() as s:
    for i in np.arange(12, 19):
        response = s.get('https://occ.ca/our-publications/page/{}/'.format(i), headers={'User-Agent': 'Mozilla'})
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'lxml')
            for a in soup.select('.publicationoverlay a'):
                links.extend([a['href']])
                a.decompose()
            pdfs = soup.findAll('div', {"class": "publicationoverlay"})
            text.extend([pdf.text.strip() for pdf in pdfs])
print(list(zip(links, text)))
Little problem with BeautifulSoup:
from bs4 import BeautifulSoup
import requests

link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")

cve_name = []
cve_link = []
for par_ in soup.find_all('div', attrs={'class':'fl'}):
    for link_ in par_.find_all('p'):
        for text_ in link_.find_all('a'):
            print(text_.string)
            print(text_['href'])
            print("==========")
            #cve_name.append(text_.string)
            #cve_link.append(text_['href'])
And it gives me every record twice :V That's probably easy to solve :V
The same elements appear in two places on the page, so you have to use find()/find_all() to restrict the search to only one of them, i.e. find(class_='list_list') in:
soup.find(class_='list_list').find_all('div', attrs={'class':'fl'})
Full code:
from bs4 import BeautifulSoup
import requests

link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")

cve_name = []
cve_link = []
for par_ in soup.find(class_='list_list').find_all('div', attrs={'class':'fl'}):
    print(len(par_))
    for link_ in par_.find_all('p'):
        for text_ in link_.find_all('a'):
            print(text_.string)
            print(text_['href'])
            print("==========")
            #cve_name.append(text_.string)
            #cve_link.append(text_['href'])
How about this? I used CSS selectors to do the same.
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests

link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
res = requests.get(link)
soup = BeautifulSoup(res.text, "lxml")

for item in soup.select('.fl p a'):
    print("Item: {}\nItem_link: {}".format(item.text, urljoin(link, item['href'])))
Partial Output:
Item: CNNVD-201712-811
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-811
Item: CNNVD-201712-810
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-810
Item: CNNVD-201712-809
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-809