Scraping a class element - python

I want to scrape class="cms-no-route cms-noroute-index page-layout-1column" in <body data-containr="body" class="cms-no-route cms-noroute-index page-layout-1column"> and save it to a txt file, but for some reason nothing happens when I run the script.
def get():
    source = requests.get("https://shop.adidas.ae/en/yeezy-boost-350-v2-ash-pearl/GY7658.html", headers=randomheaders.LoadHeader()).text
    soup = BeautifulSoup(source, 'lxml')
    x = soup.find_all('body', datacontainer_="body")
    url = x.get('class')
    filename = "adidaslive.txt"
    with open(filename, "r") as rf:
        with open(filename, "a") as af:
            if url not in rf:
                print(url)
                af.write("\n" + url)
            else:
                print("nothing")

You need to properly indent your code:

def get():
    source = ...

x = soup.find_all('body', datacontainer_="body")

Here x is a list with only one element, because there is only one 'body' tag in your HTML source code.
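Since there is only one body tag, you can grab it directly with find() and read its class list. A minimal sketch of the idea (it uses a plain User-Agent header instead of your randomheaders helper, so swap that back in as needed):

import requests
from bs4 import BeautifulSoup

def get():
    # plain header here; substitute your randomheaders.LoadHeader() if you prefer
    headers = {"User-Agent": "Mozilla/5.0"}
    source = requests.get(
        "https://shop.adidas.ae/en/yeezy-boost-350-v2-ash-pearl/GY7658.html",
        headers=headers,
    ).text
    soup = BeautifulSoup(source, 'lxml')

    # find() returns the single <body> tag (or None), unlike find_all() which returns a list
    body = soup.find('body')
    if body is None:
        print("no body tag found")
        return

    # the class attribute is parsed as a list of class names, so join it back into one string
    classes = " ".join(body.get('class', []))

    filename = "adidaslive.txt"
    # a+ opens for reading and appending, and creates the file if it doesn't exist yet
    with open(filename, "a+") as f:
        f.seek(0)
        if classes not in f.read():
            print(classes)
            f.write("\n" + classes)
        else:
            print("nothing")

get()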

Related

How can I read URLs from a text file located on my PC?

I have a Python script that scrapes data from a website. This code works fine, but I want to change the URL source to a text file on my desktop. The URLs in my text file are each on a separate line.
How do you suggest I read this file and loop through the URLs?
Thanks in advance for your time.
import csv
import requests
from bs4 import BeautifulSoup

csv_file = open('cms_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['name', 'link', 'price'])

for x in range(0, 70):
    try:
        urls = 'https://www.meisamatr.com/fa/product/cat/2-%D8%A2%D8%B1%D8%A7%DB%8C%D8%B4%DB%8C.html&pagesize[]=24&order[]=new&stock[]=1&page[]=' + str(x + 1) + '&ajax=ok?_=1561559181560'
        source = requests.get(urls).text
        soup = BeautifulSoup(source, 'lxml')
        print('Page: %s' % (x + 1))
        for figcaption in soup.find_all('figcaption'):
            price = figcaption.find('span', {'class': 'new_price'}).text.strip()
            name = figcaption.find('a', class_='title').text
            link = figcaption.find('a', class_='title')['href']
            print('%s\n%s\n%s' % (price, name, link))
            csv_writer.writerow([name, link, price])
    except:
        break

csv_file.close()
If you don't have too many URLs in that text file (urls.txt in my example), the following snippet should do what you want.
import requests

# read all URLs at once
with open("urls.txt", "r") as f:
    urls = f.read().splitlines()

# and loop over them
for url in urls:
    try:
        source = requests.get(url).text
    except Exception as e:
        print(e)
        break
Let's suppose you have a file called input.txt which looks like this:
url1
url2
url3
url4
.
.
.
Then we will simply open this input.txt file and split it on newlines ('\n'). This should give us a list of URLs, like
['url1', 'url2', 'url3']
You can then simply loop through it and crawl the webpages.
Here is a complete example:
# crawler.py
import csv
import requests
from bs4 import BeautifulSoup

with open('input.txt', 'r') as f:
    urls = f.read().split()  # here we get a list of urls

csv_file = open('cms_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['name', 'link', 'price'])

for url in urls:
    try:
        source = requests.get(url).text
        soup = BeautifulSoup(source, 'lxml')
        for figcaption in soup.find_all('figcaption'):
            price = figcaption.find('span', {'class': 'new_price'}).text.strip()
            name = figcaption.find('a', class_='title').text
            link = figcaption.find('a', class_='title')['href']
            print('%s\n%s\n%s' % (price, name, link))
            csv_writer.writerow([name, link, price])
    except Exception as e:
        print(e)
        break

csv_file.close()

How to put the image files I scraped using Beautiful Soup into a list?

This is the code I used to take all the pics from r/pics on reddit and put them into a directory. I want to be able to take the actual files in the directory and put them into a list. I'm stuck on how to do this.
import requests
from bs4 import BeautifulSoup as bs
import os

url = "https://www.reddit.com/r/pics/"
r = requests.get(url)
data = r.text
soup = bs(data, 'lxml')
image_tags = soup.findAll('img')

if not os.path.exists('direct'):
    os.makedirs('direct')
os.chdir('direct')
x = 0

for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        if source.status_code == 200:
            img_path = 'direct-' + str(x) + '.jpg'
            with open(img_path, 'wb') as f:
                f.write(requests.get(url).content)
                f.close()
            x += 1
    except:
        pass
Edit: Here is the updated code, but I'm still dealing with the problem:
import requests
from bs4 import BeautifulSoup as bs
import os

url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
soup = bs(data, 'lxml')
image_tags = soup.findAll('img')

if not os.path.exists('directory'):
    os.makedirs('directory')
os.chdir('directory')
x = 0

mylist = []
for image in image_tags:
    url = image['src']
    source = requests.get(url)
    if source.status_code == 200:
        img_path = 'direct-' + str(x) + '.jpg'
        with open(img_path, 'wb') as f:
            f.write(requests.get(url).content)
            mylist.append(img_path)
            f.close()
        x += 1
print(mylist)
Create a list at the beginning of your code:

...
mylist = []
...

Then, after you get each image, add it to the list:

...
img_path = 'direct-' + str(x) + '.jpg'
mylist.append(img_path)
...
EDIT:
I executed your updated code and image_tags comes back empty. In fact, the page returned by

url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text

doesn't contain any images. I guess reddit has some kind of protection to prevent you from fetching images this way. Try adding print(data) and you will see what I mean.
You should use the reddit API so that reddit doesn't limit your requests.
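For completeness, a minimal sketch of that route using reddit's public JSON listing (the endpoint and field names below are the standard listing format; a descriptive User-Agent is needed or reddit throttles the request):

import requests

# every subreddit exposes a JSON listing; a custom User-Agent avoids being rate-limited as a bot
headers = {"User-Agent": "simple-image-scraper/0.1"}
response = requests.get("https://www.reddit.com/r/drawing/.json", headers=headers)

image_urls = []
if response.status_code == 200:
    posts = response.json()["data"]["children"]
    for post in posts:
        link = post["data"].get("url", "")
        # keep only direct image links
        if link.endswith((".jpg", ".png", ".gif")):
            image_urls.append(link)

print(image_urls)

For anything beyond a quick scrape, the official API (for example via the praw package) is the more robust option.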

How to copy and replace an element and its children in Python with Beautiful Soup?

So, I have two HTML files and they both have divs with an ID of htmlbody. I want to check if the htmlbody element in one file is the same as the htmlbody element in the other file. If it isn't, then I want to copy the htmlbody element and replace it in the file that is different. Please see my code below.
I have tried following the 'Modifying the tree' docs here: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#append
import codecs
from bs4 import BeautifulSoup

def getMainFile():
    # opens and parses the main html file
    main_html = codecs.open("index.html", 'r')
    soup = BeautifulSoup(main_html, 'html.parser')
    # assigns the HTML content of the main file to a variable
    html_content = soup.find(id="htmlbody")
    return html_content

# User HTML file
def getUserFile():
    user_html = codecs.open("userone.html", 'r')
    soup = BeautifulSoup(user_html, 'html.parser')
    soup.prettify()
    html_content = soup.find(id="htmlbody")
    return html_content

# Checks files
if getMainFile() == getUserFile():
    print("all good")
else:
    new_content = getMainFile()
    user_html = codecs.open("userone.html", 'r')
    soup = BeautifulSoup(user_html, 'html.parser')
    with open("userone.html", "w") as file:
        file.write(str(soup.prettify()))
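One way to do the actual replacement is BeautifulSoup's replace_with(). A minimal sketch, reusing the two helper functions above and the same file names, could look like this:

if getMainFile() == getUserFile():
    print("all good")
else:
    # re-parse the user file so it can be modified and saved back out
    user_html = codecs.open("userone.html", 'r')
    soup = BeautifulSoup(user_html, 'html.parser')

    # swap the outdated element for the one taken from the main file
    old_element = soup.find(id="htmlbody")
    old_element.replace_with(getMainFile())

    with open("userone.html", "w") as file:
        file.write(soup.prettify())

replace_with() can take the tag parsed from index.html directly, so the element is moved into the userone.html tree before the file is rewritten.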

How to save the HTML source code while navigating to each link

Here is my code
driver = webdriver.Chrome()
path = "/home/winpc/test/python/dup/new"

def get_link_urls(url, driver):
    driver.get(url)
    url = urllib.urlopen(url)
    content = url.readlines()
    urls = []
    for link in driver.find_elements_by_tag_name('a'):
        elem = driver.find_element_by_xpath("//*")
        source_code = elem.get_attribute("outerHTML")
        test = link.get_attribute('href')
        if str(test) != 'None':
            file_name = test.rsplit('/')[-1].split('.')[0]
            file_name_formated = file_name + "Copy.html"
            with open(os.path.join(path, file_name_formated), 'wb') as temp_file:
                temp_file.write(source_code.encode('utf-8'))
        urls.append(link.get_attribute('href'))
    return urls

urls = get_link_urls("http://localhost:8080", driver)
sub_urls = []
for url in urls:
    if str(url) != 'None':
        sub_urls.extend(get_link_urls(url, driver))
This code properly navigates each and every link, but every time it copies only the first HTML page. I need to save the source code of each and every page it navigates to. The saving part happens using the code below:
file_name_formated = file_name + "Copy.html"
with open(os.path.join(path, file_name_formated), 'wb') as temp_file:
    temp_file.write(source_code.encode('utf-8'))
First of all, you're overwriting url again and again inside the function, so fix that first.
For saving the page source through Selenium, you can use driver.page_source.
Additionally, if you want this code to be faster, consider using the requests module:
response = requests.get(url).content
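Putting both points together, a minimal sketch of the function could look like this (the save path and local URL are the ones from the question; the hrefs are collected first, and driver.page_source is read after each navigation so every file gets its own page):

import os
from selenium import webdriver

path = "/home/winpc/test/python/dup/new"
driver = webdriver.Chrome()

def get_link_urls(url, driver):
    # collect the hrefs first, before navigating away from this page
    driver.get(url)
    urls = [link.get_attribute('href')
            for link in driver.find_elements_by_tag_name('a')
            if str(link.get_attribute('href')) != 'None']

    # now visit each link and save the source of that page
    for test in urls:
        driver.get(test)
        source_code = driver.page_source
        file_name = test.rsplit('/')[-1].split('.')[0]
        file_name_formated = file_name + "Copy.html"
        with open(os.path.join(path, file_name_formated), 'wb') as temp_file:
            temp_file.write(source_code.encode('utf-8'))
    return urls

urls = get_link_urls("http://localhost:8080", driver)

On newer Selenium releases, find_elements_by_tag_name has been replaced by driver.find_elements(By.TAG_NAME, 'a'), but the structure stays the same.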

I try to scrape data from URLs using the following code, but it displays an error message:

from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup
from array import array
import csv
url = ['http://cura.free.fr/gauq/902gdA1.html', 'http://cura.free.fr/gauq/902gdA1y.html', 'http://cura.free.fr/gauq/902gdA2.html', 'http://cura.free.fr/gauq/902gdA2y.html', 'http://cura.free.fr/gauq/902gdA3.html']
data = []
m = 0
for i in range(1,len(url)):
if m<url[i]:
page = urlopen(i)
soup = BeautifulSoup(page)
name_box = soup.find("pre")
name = name_box.text.strip()
f = open('output.txt', 'w')
print >> f, 'Filename:', name
f.close()
IndentationError: expected an indented block. This message is displayed for the if statement.
You do need to indent the blocks after the for statement and the if statement.
Try the code below:
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup
from array import array
import csv

url = [
    'http://cura.free.fr/gauq/902gdA1.html',
    'http://cura.free.fr/gauq/902gdA1y.html',
    'http://cura.free.fr/gauq/902gdA2.html',
    'http://cura.free.fr/gauq/902gdA2y.html',
    'http://cura.free.fr/gauq/902gdA3.html'
]

data = []
m = 0

for i in range(1, len(url)):
    if m < url[i]:
        page = urlopen(i)
        soup = BeautifulSoup(page)
        name_box = soup.find("pre")
        name = name_box.text.strip()
        f = open('output.txt', 'w')
        print >> f, 'Filename:', name
        f.close()
As @kuro suggested, you might want to change the position of the statements that open and close the file. If your purpose is to capture all the text within the pre tags at the given URLs, the code below will do that for you.
f = open('output.txt', 'w')
data = []
m = 0

for i in range(1, len(url)):
    if m < url[i]:
        page = urlopen(i)
        soup = BeautifulSoup(page)
        name_box = soup.find("pre")
        name = name_box.text.strip()
        print >> f, 'Filename:', name

f.close()
I rewrote the code as:
f = open('output.txt', 'w')
for i in url:
    page = urlopen(i)
    soup = BeautifulSoup(page)
    name_box = soup.find("pre")
    name = name_box.text.encode('utf-8').strip()
    print >> f, 'Filename:', name
f.close()
It worked for me.
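For reference, the same loop on Python 3 (where urllib2 and the old BeautifulSoup import no longer exist) would use requests and bs4; a rough equivalent sketch:

import requests
from bs4 import BeautifulSoup

urls = [
    'http://cura.free.fr/gauq/902gdA1.html',
    'http://cura.free.fr/gauq/902gdA1y.html',
    'http://cura.free.fr/gauq/902gdA2.html',
    'http://cura.free.fr/gauq/902gdA2y.html',
    'http://cura.free.fr/gauq/902gdA3.html',
]

with open('output.txt', 'w', encoding='utf-8') as f:
    for url in urls:
        page = requests.get(url).text
        soup = BeautifulSoup(page, 'html.parser')
        name_box = soup.find("pre")
        if name_box is not None:
            # same output format as the print >> f line above
            f.write('Filename: ' + name_box.text.strip() + '\n')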
