Extracting link text and writing to file - python

I have a crawler that extracts links from a page only if the link text includes a given string, and I'm writing the output to an HTML file. It's working, but I would like to add the whole link text next to each link, like this: "Junior Java developer - https://www.jobs.cz/junior-developer/". How can I do this?
Thanks
import requests
from bs4 import BeautifulSoup
import re

def jobs_crawler(max_pages):
    page = 1
    file_name = 'links.html'
    while page < max_pages:
        url = 'https://www.jobs.cz/prace/praha/?field%5B%5D=200900011&field%5B%5D=200900012&field%5B%5D=200900013&page=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        page += 1
        file = open(file_name, 'w')
        for link in soup.find_all('a', {'class': 'search-list__main-info__title__link'}, text=re.compile('IT', re.IGNORECASE)):
            href = link.get('href') + '\n'
            file.write('<a href="' + href + '">' + 'LINK TEXT HERE' + '</a>' + '<br />')
            print(href)
        file.close()
    print('Saved to %s' % file_name)

jobs_crawler(5)

This should help.
file.write('''<a href="{0}">{1}</a><br />'''.format(link.get('href'), link.text))

Try this:
href = link.get('href') + '\n'
txt = link.text  # gives you the link text
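Combining the two suggestions above, here is a minimal sketch of the loop body (reusing soup, re and the open file object from the question's code) that writes the link text next to its href, both as plain text and as an HTML anchor; the exact output format is my assumption:
for link in soup.find_all('a', {'class': 'search-list__main-info__title__link'}, text=re.compile('IT', re.IGNORECASE)):
    href = link.get('href')
    text = link.text.strip()            # e.g. "Junior Java developer"
    print(text + ' - ' + href)          # "Junior Java developer - https://..."
    file.write('{0} - <a href="{1}">{1}</a><br />\n'.format(text, href))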


How to save my links from BeautifulSoup in a text file with python?

I'm learning Python and web scraping. It is very cool, but I am not able to get what I want.
I'm trying to save product links in a text file so I can scrape their data afterwards.
Here is my script, which works (almost) correctly in the PyCharm console:
import bs4 as bs4
from bs4 import BeautifulSoup
import requests

suffixeUrl = '_puis_nblignes_est_200.html'

for i in range(15):
    url = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_' + str(i) + suffixeUrl
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    if response.ok:
        print('Page: ' + str(i))
        for data in soup.find_all('div', class_='price'):
            for a in data.find_all('a'):
                link = (a.get('href'))
                links = ('https://www.topachat.com/' + link)
                print(links)  # for getting link
My goal is to save the result of the links variable, line by line, in a text file.
I tried this, but something is wrong and I can't get each URL:
for link in links:
    with open("urls.txt", "a") as f:
        f.write(links+"\n")
Please, can someone help me?
You can try it this way.
Just open the file once and write all of the data to it; opening and closing a file inside a loop is not a good thing to do.
import bs4 as bs4
from bs4 import BeautifulSoup
import requests

suffixeUrl = '_puis_nblignes_est_200.html'

with open('text.txt', 'w') as f:
    for i in range(15):
        url = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_' + str(i) + suffixeUrl
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        if response.ok:
            print('Page: ' + str(i))
            for data in soup.find_all('div', class_='price'):
                for a in data.find_all('a'):
                    link = 'https://www.topachat.com/' + a.get('href')
                    f.write(link + '\n')
Sample output from text.txt
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in11020650.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in10119254.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20005046.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20002036.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20002591.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20004309.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20002592.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in10089390.html
.
.
.
Your problem is in the "for link in links" line:
link = (a.get('href'))
links = ('https://www.topachat.com/' + link)
print(links)
for link in links:
    with open("urls.txt", "a") as f:
        f.write(links+"\n")
The type of links is string, and your for loop iterates over it letter by letter (or character by character). That is why you see a single character on each line in your txt file. You can just remove the for loop and the code will work:
from bs4 import BeautifulSoup
import requests

suffixeUrl = '_puis_nblignes_est_200.html'

for i in range(15):
    url = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_' + str(i) + suffixeUrl
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    if response.ok:
        print('Page: ' + str(i))
        for data in soup.find_all('div', class_='price'):
            for a in data.find_all('a'):
                link = (a.get('href'))
                links = ('https://www.topachat.com/' + link)
                print(links)  # for getting link
                with open("urls.txt", "a") as f:
                    f.write(links + "\n")
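To make the earlier point concrete, iterating over a string visits it one character at a time, which is why the original loop wrote one letter per line. A minimal, self-contained illustration:
links = 'https://www.topachat.com/'
for link in links:
    print(link)  # prints 'h', 't', 't', 'p', 's', ':' ... one character per iteration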
You can do it like this:
import bs4 as bs4
from bs4 import BeautifulSoup
import requests

suffixeUrl = '_puis_nblignes_est_200.html'
url_list = set()

for i in range(15):
    url = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_' + str(i) + suffixeUrl
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    if response.ok:
        print('Page: ' + str(i))
        for data in soup.find_all('div', class_='price'):
            for a in data.find_all('a'):
                link = (a.get('href'))
                links = ('https://www.topachat.com/' + link)
                print(links)  # for getting link
                url_list.add(links)

with open("urls.txt", "a") as f:
    for link in url_list:
        f.write(link + "\n")

Filter web links extracted from a html website

I am new to Python. I have successfully extracted HTML links (a tags) and entered them into a CSV file.
I am only getting an output of 2, 3, or 22 links, depending on what I try.
The website has 244 links, and over half are duplicates. The correct number of non-duplicate links is 117.
This is what I have so far:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import bs4, csv

search_link = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(search_link)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
all_links = soup.find_all("a")
r.content

rem_dup = set()
for link in all_links:
    hrefs = str(link.get("href"))
    if hrefs.startswith('#http'):
        rem_dup.add(hrefs[1:])
    elif hrefs.endswith('.gov'):
        rem_dup.add(hrefs + '/')
    elif hrefs.startswith('/'):
        rem_dup.add('https://www.census.gov' + hrefs)
    else:
        rem_dup.add(hrefs)

filename = "Page_Links.csv"
f = open(filename, "w+")
f.write("LINKS\n")
f.write('https://www.census.gov')
f.close()
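The snippet above builds rem_dup but never writes it out, which is likely why the CSV ends up with so few rows. A minimal sketch of writing the deduplicated set to the CSV (reusing the rem_dup set built above) could look like this:
import csv

filename = "Page_Links.csv"
with open(filename, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["LINKS"])        # header row
    for href in sorted(rem_dup):      # one deduplicated link per row
        writer.writerow([href])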

How to save a list of results to a file

I have scraped a list of PDF links that I want from this website: https://www.gmcameetings.co.uk
It is all of the minutes from the local council's committee meetings.
I now need to save all my results into a file so I can then download and read all the PDFs.
How do I go about saving them?
This is my code:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup as bs

url = "https://www.gmcameetings.co.uk/"
r = requests.get(url)
page = r.text
soup = bs(page, 'lxml')

folder_location = r'E:\Internship\WORK'

meeting_links = soup.find_all('a', href=True)
for link in meeting_links:
    if link['href'].find('/meetings/') > 1:
        r2 = requests.get(link['href'])
        print(link['href'])
        page2 = r2.text
        soup2 = bs(page2, 'lxml')
        date_links = soup2.find_all('a', href=True)
        for dlink in date_links:
            if dlink['href'].find('/meetings/') > 1:
                r3 = requests.get(dlink['href'])
                print(dlink['href'])
                page3 = r3.text
                soup3 = bs(page3, 'lxml')
                pdf_links = soup3.find_all('a', href=True)
                for plink in pdf_links:
                    if plink['href'].find('minutes') > 1:
                        print("Minutes!")
I need a file that has all the links, which I can then read the PDFs from. Sorry, I'm completely new to coding, so I'm a bit lost.
import requests
from bs4 import BeautifulSoup as bs

url = "https://www.gmcameetings.co.uk/"
r = requests.get(url)
page = r.text
soup = bs(page, 'lxml')

f = open(r"E:\Internship\WORK\links.txt", "w+")
n = 0

meeting_links = soup.find_all('a', href=True)
for link in meeting_links:
    if link['href'].find('/meetings/') > 1:
        r2 = requests.get(link['href'])
        print(link['href'])
        page2 = r2.text
        soup2 = bs(page2, 'lxml')
        date_links = soup2.find_all('a', href=True)
        for dlink in date_links:
            if dlink['href'].find('/meetings/') > 1:
                r3 = requests.get(dlink['href'])
                print(dlink['href'])
                page3 = r3.text
                soup3 = bs(page3, 'lxml')
                pdf_links = soup3.find_all('a', href=True)
                for plink in pdf_links:
                    if plink['href'].find('minutes') > 1:
                        n += 1
                        print("Minutes!")
                        f.write("Link " + str(n) + ": " + str(plink['href']) + "\n")

f.close()
Just use a regular text file, like this, and then write whatever output you need to it:
with open('Test.txt', 'w') as file:
    file.write('Testing output')
Open the file in write mode before the for loop, write the link on each iteration, and add a newline after each one.
with open('Linkfile.txt', 'w') as f:
    for link in meeting_links:
        if link['href'].find('/meetings/') > 1:
            r2 = requests.get(link['href'])
            print("link1")
            page2 = r2.text
            soup2 = bs(page2, 'lxml')
            date_links = soup2.find_all('a', href=True)
            for dlink in date_links:
                if dlink['href'].find('/meetings/') > 1:
                    r3 = requests.get(dlink['href'])
                    print("link2")
                    page3 = r3.text
                    soup3 = bs(page3, 'lxml')
                    pdf_links = soup3.find_all('a', href=True)
                    for plink in pdf_links:
                        if plink['href'].find('minutes') > 1:
                            print(plink['href'])
                            f.write(plink['href'])
                            f.write('\n')
for link in meeting_links:
    with open('filename.txt', 'a') as fp:
        fp.write(link['href'] + '\n')
We can use Python's context manager, which opens the file (allocating the resource) and, once the operation is performed, also closes it (releasing the resource).
with open('links.txt', 'w') as file:
    file.write('required content')
We can also use whatever file extension is required, such as links.txt, links.csv, etc.
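Since the stated goal is to download and read the PDFs afterwards, here is a minimal follow-up sketch that reads the saved links back and downloads each file. It assumes the file contains one bare PDF URL per line (as in the Linkfile.txt answer above) and reuses the folder_location from the question:
import os
import requests

folder_location = r'E:\Internship\WORK'

with open(r"E:\Internship\WORK\links.txt") as f:
    for line in f:
        pdf_url = line.strip()
        if not pdf_url:
            continue
        # use the last part of the URL as the local file name
        filename = os.path.join(folder_location, pdf_url.split('/')[-1])
        response = requests.get(pdf_url)
        with open(filename, 'wb') as out:
            out.write(response.content)  # save the raw PDF bytes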

scrape image and title from https://www.open2study.com/courses using python and BeautifulSoup

from bs4 import BeautifulSoup
import urllib

r = urllib.urlopen('https://www.open2study.com/courses').read()
soup = BeautifulSoup(r)
links = soup.find('figure').find_all('img', src=True)
for link in links:
    txt = open('test.txt', "w")
    link = link["src"].split("src=")[-1]
    download_img = urllib.urlopen('https://www.open2study.com/courses')
    txt.write(download_img.read())
    txt.close()
I need to scrape the image and title from this website.
Instead of doing a split, you can grab the src directly with BeautifulSoup.
Use this to get each div that has the title and image in it:
for link in soup.find_all("div",attrs={"class" : "courses_adblock_start"}):
Then use this to grab the title and image in each div:
link.find("h2",attrs={"class":"adblock_course_title"}).get_text())
link.find("img", attrs={"class":"image-style-course-logo-subjects-block"}).get("src"))
You also open the page on every pass of the loop, which you want to avoid; you only need to open it once and then use it in the loop, like so:
url = "http://www.open2study.com/courses"
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page.read())
for link in soup.find_all("div",attrs={"class" : "courses_adblock_start"}):
try:
print("Title : " + link.find("h2",attrs={"class":"adblock_course_title"}).get_text())
print("Image : " + link.find("img", attrs={"class":"image-style-course-logo-subjects-block"}).get("src"))
except:
print("error")
Here is the new output:
Title : World Music
Image : https://www.open2study.com/sites/default/files/styles/course_logo_subjects_block/public/Course%20Tile_world_music.jpg?itok=CG6pvXHp
Title : Writing for the Web
Image : https://www.open2study.com/sites/default/files/styles/course_logo_subjects_block/public/3_writing_for_web_C_0.jpg?itok=exQApr-1
Something like this?
import urllib
from bs4 import BeautifulSoup

titles = []
images = []

r = urllib.urlopen('https://www.open2study.com/courses').read()
soup = BeautifulSoup(r)

for i in soup.find_all('div', {'class': "courses_adblock_rollover"}):
    titles.append(i.h2.text)

for i in soup.find_all('img', {'class': "image-style-course-logo-subjects-block"}):
    images.append(i.get('src'))

with open('test.txt', "w") as f:
    for i in zip(titles, images):
        f.write(i[0].encode('ascii', 'ignore') +
                '\n' + i[1].encode('ascii', 'ignore') +
                '\n\n')
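If the image files themselves are needed on disk (the question's original loop tried to do this), here is a minimal sketch that downloads each src collected in the images list above; using requests and Python 3 here is my choice, not part of the original answers:
import os
import requests

# `images` is the list of src URLs collected above
for src in images:
    filename = os.path.basename(src.split('?')[0])  # strip query string like ?itok=...
    response = requests.get(src)
    if response.ok:
        with open(filename, 'wb') as f:
            f.write(response.content)  # save the raw image bytes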

Opening page using urllib2 - diacritics

I'm trying to open multiple pages using urllib2. The problem is that some pages can't be opened; it returns urllib2.HTTPError: HTTP Error 400: Bad Request.
I'm getting the hrefs of these pages from another web page (the head of that page declares charset="utf-8").
The error is returned only when I try to open a page containing 'č', 'ž' or 'ř' in the URL.
Here is the code:
def getSoup(url):
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    page = response.read()
    soup = BeautifulSoup(page, 'html.parser')
    return soup

hovienko = getSoup("http://www.hovno.cz/hovna-az/a/1/")
lis = hovienko.find("div", class_="span12").find('ul').findAll('li')

for liTag in lis:
    aTag = liTag.find('a')['href']
    href = "http://www.hovno.cz" + aTag  # hrefs I'm trying to open using urllib2
    soup = getSoup(href.encode("iso-8859-2"))  # errors occur here when 'č', 'ž' or 'ř' is in the URL
Does anybody know what I have to do to avoid these errors?
Thank you
This site is UTF-8. Why do you need href.encode("iso-8859-2")? I have taken the following code from http://programming-review.com/beautifulsoasome-interesting-python-functions/
import urllib2
import cgitb
cgitb.enable()
from BeautifulSoup import BeautifulSoup
from urlparse import urlparse

# print all links
def PrintLinks(localurl):
    data = urllib2.urlopen(localurl).read()
    print 'Encoding of fetched HTML : %s', type(data)
    soup = BeautifulSoup(data)
    parse = urlparse(localurl)
    localurl = parse[0] + "://" + parse[1]
    print "<h3>Page links statistics</h3>"
    l = soup.findAll("a", attrs={"href": True})
    print "<h4>Total links count = " + str(len(l)) + '</h4>'
    externallinks = []  # external links list
    for link in l:
        # if it's an external link
        if link['href'].find("http://") == 0 and link['href'].find(localurl) == -1:
            externallinks = externallinks + [link]
    print "<h4>External links count = " + str(len(externallinks)) + '</h4>'
    if len(externallinks) > 0:
        print "<h3>External links list:</h3>"
        for link in externallinks:
            if link.text != '':
                print '<h5>' + link.text.encode('utf-8')
                print ' => [' + '<a href="' + link['href'] + '" >' + link['href'] + '</a>' + ']' + '</h5>'
            else:
                print '<h5>' + '[image]',
                print ' => [' + '<a href="' + link['href'] + '" >' + link['href'] + '</a>' + ']' + '</h5>'

PrintLinks("http://www.zlatestranky.cz/pro-mobily/")
The solution was very simple. I should have used urllib2.quote().
EDITED CODE:
for liTag in lis:
    aTag = liTag.find('a')['href']
    href = "http://www.hovno.cz" + urllib2.quote(aTag.encode("utf-8"))
    soup = getSoup(href)
A couple of things here.
First, your URIs can't contain non-ASCII characters. You have to escape them. See this:
How to fetch a non-ascii url with Python urlopen?
Secondly, save yourself a world of pain and use requests for HTTP work.
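A minimal sketch of that suggestion, using a hypothetical URL path containing diacritics (the path itself and the Python 3 urllib.parse.quote call are my assumptions; urllib2.quote plays the same role in Python 2):
# -*- coding: utf-8 -*-
import requests
from urllib.parse import quote  # Python 3 counterpart of urllib2.quote

# hypothetical path containing diacritics
path = u"/hovna-az/čšž/"

# manual percent-encoding, as in the accepted fix above
safe_url = "http://www.hovno.cz" + quote(path.encode("utf-8"))
print(safe_url)

# requests percent-encodes the URL itself, so the diacritics alone
# no longer trigger a 400 Bad Request
response = requests.get("http://www.hovno.cz" + path)
print(response.status_code)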
