Can't crawl website - Python

I want to crawl a site, but I'm having some trouble: I need to open the link for every product on the page, get its information, and save each product to an .html file.
For now I can only print all the links on the page:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import ssl
import urllib3
from requests import request

urllib3.disable_warnings()

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

my_url = 'https://franke-market.com.ua/moyki.html?on_page=100'
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
headers = {'User-Agent': user_agent}

t = request('GET', url=my_url, headers=headers, verify=False).text

uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

page_soup = soup(page_html, "html.parser")

filename = "web.html"
f = open(filename, "w")
for container in page_soup.findAll('div', attrs={'class': 'product'}):
    f.write(container.a['href'] + '\n')
f.close()

You appear to be trying to get a list of URLs from a first page and then wanting to extract information from each of those URLs. To do this, a request needs to be made for each URL, with a separate BeautifulSoup parse for each.
Once you have the sub page, the information can then be extracted, for example the name of the product and its price.
Finally, you could either print this information or write it to a file. The easiest way is to write it as a CSV file. In this example I show how you could write the URL, name, and price as a single row; the csv library takes care of the formatting automatically:
from urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup
import requests
import csv

requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

my_url = 'https://franke-market.com.ua/moyki.html?on_page=100'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}

req = requests.get(my_url, headers=headers, verify=False)
soup = BeautifulSoup(req.content, "html.parser")

with open("products.csv", "w", newline="") as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['URL', 'Title', 'Price'])

    # Find the URLs for all the products
    for div in soup.find_all('div', attrs={'class': 'product'})[:5]:
        url = div.a['href']
        print(url)

        # For each URL, get the sub page and get the name and price
        req_sub = requests.get(url, headers=headers, verify=False)
        soup_sub = BeautifulSoup(req_sub.content, "html.parser")
        title = soup_sub.find('h1', class_='title').text
        price_info = soup_sub.find('div', class_='price_info').span.text

        # Write the url, name and price as a CSV row
        csv_output.writerow([url, title, price_info])
This gives you a products.csv file starting:
URL,Title,Price
https://franke-market.com.ua/franke_rol_610-38_101_0267_707_.html,Franke ROL 610-38 (101.0267.707),91795
https://franke-market.com.ua/franke-pmn-611i-101.0255.790.html,Franke Pamira PMN 611i (101.0255.790),57935
https://franke-market.com.ua/franke_pxl_611-60_101_0330_655_.html,Franke PXL 611-60 (101.0330.655),93222
https://franke-market.com.ua/franke-ron-610-41-101.0255.783.html,Franke Ron 610-41 (101.0255.783),57939
https://franke-market.com.ua/franke_pxl_611-78_101_0330_657_.html,Franke PXL 611-78 (101.0330.657),93223
You could then open this file in a spreadsheet application.
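If you'd rather sanity-check the output from Python instead, a minimal sketch reading products.csv back with the standard csv module (assuming the file was written as above):
import csv

# Read back the CSV written above and print each row as a list.
with open("products.csv", newline="") as f_input:
    for row in csv.reader(f_input):
        print(row)  # ['URL', 'Title', 'Price'], then one row per product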

Related

Parsing text with bs4 works with selenium but does not work with requests in Python

This code works and returns the single-digit number I want, but it's very slow and takes a good 10 seconds to complete. I will be running this 4 times for my use case, so that's 40 seconds wasted on every run.
from selenium import webdriver
from bs4 import BeautifulSoup

options = webdriver.FirefoxOptions()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
driver.get('https://warframe.market/items/ivara_prime_blueprint')
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
price_element = soup.find('div', {'class': 'row order-row--Alcph'})
price2 = price_element.find('div', {'class': 'order-row__price--hn3HU'})
price = price2.text
print(int(price))
driver.close()
This code, on the other hand, does not work: it returns None.
import requests
from bs4 import BeautifulSoup

url = 'https://warframe.market/items/ivara_prime_blueprint'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
price_element = soup.find('div', {'class': 'row order-row--Alcph'})
price2 = price_element.find('div', {'class': 'order-row__price--hn3HU'})
price = price2.text
print(int(price))
My first thought was to add a user agent, but that still didn't work. When I print(soup) it gives me HTML, but when I parse it further it stops and starts giving me None, even though it's the same command as in the Selenium example.
The data is loaded dynamically within a <script> tag, so BeautifulSoup doesn't see it (it doesn't render JavaScript).
As an example, to get the data, you can use:
import json
import requests
from bs4 import BeautifulSoup

url = "https://warframe.market/items/ivara_prime_blueprint"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

script_tag = soup.select_one("#application-state")
json_data = json.loads(script_tag.string)

# Uncomment the lines below to see all the data
# from pprint import pprint
# pprint(json_data)

for data in json_data["payload"]["orders"]:
    print(data["user"]["ingame_name"])
Prints:
Rogue_Monarch
Rappei
KentKoes
Tenno61189
spinifer14
Andyfr0nt
hollowberzinho
You can access the data as a dict and access the keys/values.
I'd recommend an online tool to view all the JSON, since it's quite large.
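Alternatively, a quick way to inspect it locally is to dump the parsed data to a file (a minimal sketch, reusing json_data from the script above) and open it in an editor or JSON viewer:
import json

# Write the parsed application state to a file for inspection.
with open("application_state.json", "w", encoding="utf-8") as f:
    json.dump(json_data, f, indent=2, ensure_ascii=False)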
See also
Parsing out specific values from JSON object in BeautifulSoup

Get text from a <span class=...> with BeautifulSoup and requests

So I tried to get a specific text from a website, but it only gives me this error:
floor = soup.find('span', {'class': 'text-white fs-14px text-truncate attribute-value'}).text
AttributeError: 'NoneType' object has no attribute 'text'
I specifically want to get the 'Floor Price' text.
My code:
import requests
from bs4 import BeautifulSoup

#target url
url = "https://magiceden.io/marketplace/solsamo"
#act like browser
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
response = requests.get('https://magiceden.io/marketplace/solsamo')
#parse the downloaded page
soup = BeautifulSoup(response.content, 'lxml')
floor = soup.find('span', {'class': 'text-white fs-14px text-truncate attribute-value'}).text
print(floor)
The data you need is not in the HTML you receive after:
response = requests.get('https://magiceden.io/marketplace/solsamo')
You can verify this by looking at the page source code:
view-source:https://magiceden.io/marketplace/solsamo
You should use Selenium instead of requests to get your data, or examine the XHR requests on this page; you may be able to get the data with requests by following a different URL.
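As a rough illustration of the Selenium route, a minimal sketch (the CSS class string is copied from the question; whether it still matches the live page is an assumption):
import time
from selenium import webdriver
from bs4 import BeautifulSoup

# Render the page in a headless browser so the JavaScript runs,
# then hand the resulting HTML to BeautifulSoup.
options = webdriver.FirefoxOptions()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
driver.get('https://magiceden.io/marketplace/solsamo')
time.sleep(5)  # crude wait for client-side rendering; an explicit wait would be better
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()

# The class string comes from the question and may have changed on the site.
floor = soup.find('span', {'class': 'text-white fs-14px text-truncate attribute-value'})
print(floor.text if floor else 'element not found')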

How do I retrieve the text between those tags?

https://imgur.com/a/JcTnbiw
How do I retrieve the highlighted text with BeautifulSoup?
An example would be the best answer, thank you ;)
Edit: here's the code:
import requests
import pyperclip
from bs4 import BeautifulSoup
import time

url = 'https://sales.elhst.co/'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36"}
site = requests.get(url, headers=headers)
site = str(site)
if site == "<Response [200]>":
    print("Site is up..")
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
time.sleep(2)
target = soup.find("pp", id="copies")
print(target)
And the output is:
Site is up..
<pp id="copies"></pp>
I want to get this text:
https://imgur.com/a/JcTnbiw
Is there any way to do it?
The data you see on the page is loaded from an external URL. You can try this script to print the number of copies:
import re
import json
import requests

url = 'https://sales.elhst.co/socket.io/?EIO=3&transport=polling'
copies_url = 'https://sales.elhst.co/socket.io/?EIO=3&transport=polling&sid={sid}'

# The first request returns a session id (sid) embedded in the response
r = requests.get(url).text
sid = json.loads(re.search(r'(\{".*)', r).group(1))['sid']

# A second request made with that sid returns the number of copies
r = requests.get(copies_url.format(sid=sid)).text
copies = json.loads(re.search(r'(\[".*)', r).group(1))[-1]
print(copies)
Prints:
0
Another answer suggests lxml with XPath instead:
from lxml import html
import requests

page = requests.get('http://url')
tree = html.fromstring(page.content)
# This will extract the text you need
buyers = tree.xpath('//pp[@id="copies"]/text()')
It should work, but <pp> is not a standard HTML tag; it is probably a mistake and the tag should be <p>.
More info about lxml here.

How to extract Amazon product links using Python

I'm a beginner in Python, and I just want to scrape product links from an Amazon page.
For example, I want to scrape this page:
http://www.amazon.com/s/ref=sr_in_-2_p_4_18?me=A3MZ96G5C78IVQ&fst=as%3Aoff&rh=p_4%3AFunKo&ie=UTF8&qid=1477811368
and I use this code in Python:
from bs4 import BeautifulSoup
import requests

url = "http://www.amazon.com/s/ref=sr_in_-2_p_4_18?me=A3MZ96G5C78IVQ&fst=as%3Aoff&rh=p_4%3AFunKo&ie=UTF8&qid=1477811368"
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")

file = open("parseddata.txt", "w")
links = soup.find_all('a', {'class': 'a-link-normal s-access-detail-page a-text-normal'})
for link in links:
    print(link.get('href'))
    file.write(link.get('href') + '\n')
file.close()
I just want the product title links as the output. Can anyone tell me where I am going wrong?
Add a user-agent to the request headers to pretend that you are not a robot.
from bs4 import BeautifulSoup
import requests

url = "http://www.amazon.com/s/ref=sr_in_-2_p_4_18?me=A3MZ96G5C78IVQ&fst=as%3Aoff&rh=p_4%3AFunKo&ie=UTF8&qid=1477811368"

# add header
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'
}

r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, "lxml")

file = open(r"parseddata.txt", "w")
links = soup.find_all('a', {'class': 'a-link-normal s-access-detail-page a-text-normal'})
for link in links:
    print(link.get('href'))
    file.write(link.get('href') + '\n')
file.close()
Result
https://www.amazon.com/Funko-POP-Marvel-Dancing-Bobble/dp/B00N1EJXUU/ref=sr_1_1/160-5408618-6684940?m=A3MZ96G5C78IVQ&s=merchant-items&ie=UTF8&qid=1477822032&sr=1-1&refinements=p_4%3AFunKo
https://www.amazon.com/Funko-POP-Movies-Potter-Action/dp/B019JIA4IQ/ref=sr_1_2/160-5408618-6684940?m=A3MZ96G5C78IVQ&s=merchant-items&ie=UTF8&qid=1477822032&sr=1-2&refinements=p_4%3AFunKo
https://www.amazon.com/FunKo-2390-Funko-Darth-Maul/dp/B005F1QBMK/ref=sr_1_3/160-5408618-6684940?m=A3MZ96G5C78IVQ&s=merchant-items&ie=UTF8&qid=1477822032&sr=1-3&refinements=p_4%3AFunKo
........

BeautifulSoup can't extract data from this website

import csv
import requests
from bs4 import BeautifulSoup
import urllib2

opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]

f = open('ala2009link.csv', 'r')
s = open('2009alanews.csv', 'w')
for row in csv.reader(f):
    url = row[0]
    print url
    res = requests.get(url)
    print res.content
    soup = BeautifulSoup(res.content)
    data = soup.find_all("article", {"class": "article-wrapper news"})
    #data = soup.find_all("main", {"class": "main-content"})
    for item in data:
        title = item.find_all("h2", {"class": "article-headline"})[0].text
        s.write("%s \n" % title)
    content = soup.find_all("p")
    for main in content:
        k = main.text.encode('utf-8')
        s.write("%s \n" % k)
    #k = csv.writer(s)
    #k.writerow('%s\n' % (main))
s.close()
f.close()
This is my code to extract data from the website, but I don't know why I can't extract the data. Is an ad-blocker warning blocking my BeautifulSoup?
This is the example link: http://www.rolltide.com/news/2009/6/23/Bert_Bank_Passes_Away.aspx?path=football
The reason that no results are returned is that this website requires a User-Agent header in your request.
To fix this, add a headers parameter with a User-Agent to the requests.get() call, like so:
url = 'http://www.rolltide.com/news/2009/6/23/Bert_Bank_Passes_Away.aspx?path=football'
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/29.0.1547.65 Chrome/29.0.1547.65 Safari/537.36',
}
res = requests.get(url, headers=headers)
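With the header in place, the asker's selectors should then find the article. A minimal sketch combining the two (the class names are taken from the question's code and are assumptions about the page):
import requests
from bs4 import BeautifulSoup

url = 'http://www.rolltide.com/news/2009/6/23/Bert_Bank_Passes_Away.aspx?path=football'
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/29.0.1547.65 Chrome/29.0.1547.65 Safari/537.36',
}

res = requests.get(url, headers=headers)
soup = BeautifulSoup(res.content, 'html.parser')

# 'article-wrapper news' and 'article-headline' come from the question's code.
for item in soup.find_all('article', {'class': 'article-wrapper news'}):
    headline = item.find('h2', {'class': 'article-headline'})
    if headline:
        print(headline.text.strip())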
