How do I implement multithreading in my web scraper? - python

I am currently working on a project that requires me to extract data from hundreds of pages. However, I have noticed that the whole extraction is taking too long, since the scraper has to process around 800+ pages. I have read about multiprocessing, which I believe can speed things up, but I don't really know how to integrate it into my current code.
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
import time

final_data = []
for i in range(1, 8271, 10):
    url = f'https://www.fca.org.uk/news/search-results?np_category=warnings&start={i}'
    req = requests.get(url)
    start = time.process_time()
    page_html = req.content
    page_soup = soup(page_html, "lxml")
    data = page_soup.find_all('li', class_='search-item')
    print(f'Processing {url}')
    for x in data:
        list = {}
        list['name'] = x.find('a', 'search-item__clickthrough').text.strip()
        try:
            list['published_date'] = x.find('span', 'meta-item published-date').text
        except:
            list['published_date'] = 'None'
        list['modified_date'] = x.find('span', 'meta-item modified-date').text
        final_data.append(list)

df = pd.DataFrame(final_data)
TodaysDate = time.strftime("%Y%m%d")
csvfilename = TodaysDate + "_FCA Macro.csv"
df.to_csv(csvfilename, encoding="utf-8-sig")

from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
import time
import threading

final_data = []

def scrape(i):
    url = f'https://www.fca.org.uk/news/search-results?np_category=warnings&start={i}'
    req = requests.get(url)
    start = time.process_time()
    page_html = req.content
    page_soup = soup(page_html, "lxml")
    data = page_soup.find_all('li', class_='search-item')
    print(f'Processing {url}')
    for x in data:
        list = {}
        list['name'] = x.find('a', 'search-item__clickthrough').text.strip()
        try:
            list['published_date'] = x.find('span', 'meta-item published-date').text
        except:
            list['published_date'] = 'None'
        list['modified_date'] = x.find('span', 'meta-item modified-date').text
        final_data.append(list)

threads = []
for i in range(1, 8271, 10):
    t = threading.Thread(target=scrape, args=(i,))
    t.start()
    threads.append(t)

# Wait for every thread to finish before building the DataFrame,
# otherwise the CSV may be written while pages are still being scraped.
for t in threads:
    t.join()

df = pd.DataFrame(final_data)
TodaysDate = time.strftime("%Y%m%d")
csvfilename = TodaysDate + "_FCA Macro.csv"
df.to_csv(csvfilename, encoding="utf-8-sig")
See https://realpython.com/intro-to-python-threading/ for more background.
Just use a for loop to create the threads, passing i to each new thread created. .start() is needed to launch a thread, and .join() makes the main thread wait for all of them to finish before the DataFrame is built.
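A bounded thread pool is an alternative worth considering, since launching 800+ threads at once can overwhelm both your machine and the target server. This is a minimal sketch using concurrent.futures and reusing the scrape() function above; the worker count of 16 is an assumed value you would tune, not something from the original answer:

from concurrent.futures import ThreadPoolExecutor

# At most 16 pages are fetched concurrently (assumed value, tune as needed)
with ThreadPoolExecutor(max_workers=16) as executor:
    executor.map(scrape, range(1, 8271, 10))

# The with-block exits only after every scrape() call has returned,
# so final_data is complete by the time the DataFrame is built.
df = pd.DataFrame(final_data)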

Related

Iterate Over URLs Using BeautifulSoup

I have written some code to gather URLs for each race course from https://www.horseracing.net/racecards. I have also written some code to scrape data from each race course page.
Each bit of code works as it should, but I am having trouble creating a for loop to iterate through all the race course URLs.
Here's the code to scrape the course URLs:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"

reqs = requests.get(todays_racecard_url)
content = reqs.text
soup = BeautifulSoup(content, 'html.parser')

course_urls = []
for h in soup.findAll('h3'):
    a = h.find('a')
    try:
        if 'href' in a.attrs:
            card_url = urljoin(base_url, a.get('href'))
            course_urls.append(card_url)
    except:
        pass

for card_url in course_urls:
    print(card_url)
And here's the code to scrape the pages:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

url = "https://www.horseracing.net/racecards/fontwell/13-05-21"
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")

date = []
course = []
time = []
runner = []
tips = []
tipsters = []

runner_div = soup.find_all('div', class_='row-cell-right')
for container in runner_div:
    runner_name = container.h5.a.text
    runner.append(runner_name)
    tips_no = container.find('span', class_='tip-text number-tip').text if container.find('span', class_='tip-text number-tip') else ''
    tips.append(tips_no)
    tipster_names = container.find('span', class_='pointers-text currency-text').text if container.find('span', class_='pointers-text currency-text') else ''
    tipsters.append(tipster_names)

newspaper_tips = pd.DataFrame({
    'Runners': runner,
    'Tips': tips,
    'Tipsters': tipsters,
})

newspaper_tips['Tipsters'] = newspaper_tips['Tipsters'].str.replace(' - ', '')
newspaper_tips.to_csv('NewspaperTips.csv', mode='a', header=False, index=False)
How do I join them to get the result I'm looking for?
It could be combined as follows:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"

req = requests.get(todays_racecard_url)
soup_racecard = BeautifulSoup(req.content, 'html.parser')
df = pd.DataFrame(columns=['Runners', 'Tips', 'Tipsters'])

for h in soup_racecard.find_all('h3'):
    a = h.find('a', href=True)   # only find tags with href present
    if a:
        url = urljoin(base_url, a['href'])
        print(url)
        results = requests.get(url)
        soup_url = BeautifulSoup(results.text, "html.parser")
        for container in soup_url.find_all('div', class_='row-cell-right'):
            runner_name = container.h5.a.text
            tips_no = container.find('span', class_='tip-text number-tip').text if container.find('span', class_='tip-text number-tip') else ''
            tipster_names = container.find('span', class_='pointers-text currency-text').text if container.find('span', class_='pointers-text currency-text') else ''
            row = [runner_name, tips_no, tipster_names]
            df.loc[len(df)] = row   # append the new row

df['Tipsters'] = df['Tipsters'].str.replace(' - ', '')
df.to_csv('NewspaperTips.csv', index=False)
Giving you a CSV starting:
Runners,Tips,Tipsters
Ajrad,2,NEWMARKET
Royal Tribute,1,The Times
Time Interval,1,Daily Mirror
Hemsworth,1,Daily Express
Ancient Times,,
Final Watch,,
Hala Joud,,
May Night,1,The Star
Tell'Em Nowt,,

Error during a loop to extract content from a scraped link using BeautifulSoup

I've been working on this web scraper for a while, trying to get the body content of the different links of an online newsletter. If I break out the code for the second loop and run it separately, it returns the correct results; however, when the same part is put inside the loop in the bigger script, it raises the error "IndexError: list index out of range".
This is the script whose second loop returns the error (UPDATED):
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import pandas as pd

def pages(myurl):
    # opening up connection, grabbing the page
    uClient = uReq(myurl)
    page_html = uClient.read()
    uClient.close()

    # html parsing
    page_soup = soup(page_html, "html.parser")

    dt = []
    ttl = []
    name = []
    body = []
    source = []

    # grabs each newsletter subjects
    titular = page_soup.findAll("div", {"class": "col-md-9 col-sm-9 col-xs-9"})
    titular[0]
    tit1 = titular[0]

    fixed = 'https://www.df.cl/noticias/site/tax/port/all'

    for tit1 in titular:
        date = tit1.span.text
        dt.append(date)
        title = tit1.h2.a.text
        ttl.append(title)
        link = tit1.h2.a["href"].strip()
        source.append(fixed + link)

    df = pd.DataFrame(dt, columns=['date'])
    df['title'] = ttl
    df['link'] = source

    for link in df['link']:
        new_link = fixed + link
        page = uReq(new_link)
        page_html_1 = page.read()
        page.close()
        page_soup = soup(page_html_1, "html.parser")
        content = page_soup.findAll("div", {"class": "entry cuerpo-noticias CUERPO"})
        content[0].text
        cont1 = content[0].text
        body.append(cont1)

    df['content'] = body
    print(df)
    #df.to_csv(u'C:/Users/snpw9/Desktop/Scripts/sample_scrap.csv', mode='a', header=False, index=False)

pages('https://www.df.cl/noticias/site/tax/port/all/taxport_3_230__1.html') #Banca y Fintech
pages('https://www.df.cl/noticias/site/tax/port/all/taxport_3_20__1.html')  #Bolsa y Monedas
pages('https://www.df.cl/noticias/site/tax/port/all/taxport_3_226__1.html') #Pensiones
pages('https://www.df.cl/noticias/site/tax/port/all/taxport_3_228__1.html') #Seguros
It would be very helpful to make this part work, hopefully with your help!
The second script without the loop (which works properly):
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

myurl = 'https://www.df.cl/noticias/site/tax/port/all/noticias/mercados/banca-fintech/bancoestado-destina-90-millones-para-los-gastos-de-internet-de-sus/2020-07-07/152240.html'

#def pages(myurl):

# opening up connection, grabbing the page
uClient = uReq(myurl)
page_html = uClient.read()
uClient.close()

# html parsing
page_soup = soup(page_html, "html.parser")

# grabs each newsletter subjects
content = page_soup.findAll("div", {"class": "entry cuerpo-noticias CUERPO"})
cont1 = content[0].text
print(cont1)
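For reference, an IndexError of this kind usually means findAll returned an empty list for one of the scraped links (no div with that class on the page), so content[0] has nothing to index. A minimal guard sketch reusing the names from the loop above, not a confirmed fix for this particular site:

content = page_soup.findAll("div", {"class": "entry cuerpo-noticias CUERPO"})
if content:                      # skip pages where the body div is missing
    body.append(content[0].text)
else:
    body.append('None')          # keep body the same length as the DataFrame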

Saving and scraping multiple pages with BeautifulSoup and pandas

I tested my code in a Jupyter notebook with this code:
...
rname = soup.find('p', 'con_tx')
#rnamelis = rname.findAll('p')
rname
from urllib.request import urljoin
story=[]
#review_text = lis[0].find('p').getText()
#list_soup =soup.find_all('p', 'con_tx')
story=rname.getText()
story
and it worked well.
(result) '전 여친에 ...'
But when I tried to scrape multiple pages
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import urljoin
import pandas as pd
import numpy as np
import requests

base_url = 'https://movie.naver.com/movie/bi/mi/basic.nhn?code='
pages = ['177374', '164102']
url = base_url + pages[0]
story = []

for n in pages:
    # Create url
    url = base_url + n
    # Parse data using BS
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    #print(soup.find('p', 'con_tx'))
    rname = soup.find('p', 'con_tx')
    story = rname.getText()
    data = {story}
    df = pd.DataFrame(data)
    df.head()

df.to_csv('./moviestory.csv', sep=',', encoding='EUC-KR')
An error message came out.
ValueError: DataFrame constructor not properly called!
How do I fix my code?
[Screenshot: crawling area]
Not sure what you are trying to do, but one thing I'm noticing is that you are overwriting your DataFrame each time. I also don't know why you initialise story as a list and then wrap it in a set inside the loop (data = {story} builds a set, which is what makes the DataFrame constructor complain).
from bs4 import BeautifulSoup
import pandas as pd
import requests

base_url = 'https://movie.naver.com/movie/bi/mi/basic.nhn?code='
pages = ['177374', '164102']

df = pd.DataFrame()
for n in pages:
    # Create url
    url = base_url + n
    # Parse data using BS
    print('Downloading page %s...' % url)
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    rname = soup.find('p', 'con_tx')
    story = rname.getText()
    data = [story]
    df = df.append(pd.DataFrame(data), sort=True).reset_index(drop=True)

df.to_csv('./moviestory.csv', sep=',', encoding='EUC-KR')
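One caveat: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on a current install the same accumulation needs pd.concat. A minimal sketch of that variant, with everything else unchanged:

frames = []
for n in pages:
    url = base_url + n
    print('Downloading page %s...' % url)
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    story = soup.find('p', 'con_tx').getText()
    frames.append(pd.DataFrame([story]))

# Concatenate once at the end instead of appending row by row
df = pd.concat(frames, ignore_index=True)
df.to_csv('./moviestory.csv', sep=',', encoding='EUC-KR')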

Web Scraping Real-Time

I am currently web scraping with BeautifulSoup; the data is fetched as XML and written to a CSV as shown in the code below. I am just wondering what I could do to make it run in real time, since the website updates every 5 minutes.
import csv
import requests
from bs4 import BeautifulSoup

url = 'http://www.dublincity.ie/dublintraffic/cpdata.xml?1543254514266'
res = requests.get(url)
soup = BeautifulSoup(res.content, "xml")

data = []
for item in soup.select("carpark"):
    ditem = {}
    ditem['Name'] = item.get("name")
    ditem['Spaces'] = item.get("spaces")
    data.append(ditem)

with open("xmldocs.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, ["Name", "Spaces"])
    writer.writeheader()
    for info in data:
        writer.writerow(info)
You can use a while loop, and at the end of each iteration add a sleep of 5 minutes.
Using your example this would be:
import csv
import requests
from bs4 import BeautifulSoup
import time

while True:
    url = 'http://www.dublincity.ie/dublintraffic/cpdata.xml?1543254514266'
    res = requests.get(url)
    soup = BeautifulSoup(res.content, "xml")

    data = []
    for item in soup.select("carpark"):
        ditem = {}
        ditem['Name'] = item.get("name")
        ditem['Spaces'] = item.get("spaces")
        data.append(ditem)

    with open("xmldocs.csv", "w", newline="") as f:
        writer = csv.DictWriter(f, ["Name", "Spaces"])
        writer.writeheader()
        for info in data:
            writer.writerow(info)

    time.sleep(5 * 60)
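Because the scrape itself takes some time, a flat time.sleep(5 * 60) makes each cycle start slightly later than the previous one. If the polls need to stay aligned to a 5-minute grid, one common pattern (a sketch, not part of the original answer) is to compute the remaining wait each cycle:

import time

INTERVAL = 5 * 60
next_run = time.monotonic()
while True:
    # ... fetch, parse and write the CSV exactly as above ...
    next_run += INTERVAL
    time.sleep(max(0, next_run - time.monotonic()))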

Python scraping, saving to a CSV file. However, not all variables are in the CSV

I have a problem with writing the scraped data to a CSV file. While the pages load and the first part of the script works, the writing to CSV causes a problem.
My question is: how can I write the data (Name, Home State and Backer State) to a CSV file? The following code only writes Category to the CSV file.
Code:
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time
from datetime import datetime
from collections import OrderedDict
import re

browser = webdriver.Firefox()
browser.get('https://www.kickstarter.com/discover?ref=nav')
categories = browser.find_elements_by_class_name('category-container')

category_links = []
for category_link in categories:
    # Each item in the list is a tuple of the category's name and its link.
    category_links.append((str(category_link.find_element_by_class_name('f3').text),
                           category_link.find_element_by_class_name('bg-white').get_attribute('href')))

scraped_data = []
now = datetime.now()
counter = 1

for category in category_links:
    browser.get(category[1])
    browser.find_element_by_class_name('sentence-open').click()
    time.sleep(2)
    browser.find_element_by_id('category_filter').click()
    time.sleep(2)

    for i in range(27):
        try:
            time.sleep(2)
            browser.find_element_by_id('category_' + str(i)).click()
            time.sleep(2)
        except:
            pass

    projects = []
    for project_link in browser.find_elements_by_class_name('clamp-3'):
        projects.append(project_link.find_element_by_tag_name('a').get_attribute('href'))

    for counter, project in enumerate(projects):
        page1 = urllib.request.urlopen(projects[counter])
        soup1 = BeautifulSoup(page1, "lxml")
        page2 = urllib.request.urlopen(projects[counter].split('?')[0] + '/community')
        soup2 = BeautifulSoup(page2, "lxml")
        time.sleep(2)
        print(str(counter) + ': ' + project + '\nStatus: Started.')
        project_dict = OrderedDict()
        project_dict['Category'] = category[0]
        browser.get(project)
        project_dict['Name'] = soup1.find(class_='type-24 type-28-sm type-38-md navy-700 medium mb3').text
        project_dict['Home State'] = soup1.find(class_='nowrap navy-700 flex items-center medium type-12').text
        try:
            project_dict['Backer State'] = soup2.find(class_='location-list-wrapper js-location-list-wrapper').text
        except:
            pass
        print('Status: Done.')
        counter += 1
        scraped_data.append(project_dict)

later = datetime.now()
diff = later - now
print('The scraping took ' + str(round(diff.seconds/60.0, 2)) + ' minutes, and scraped ' + str(len(scraped_data)) + ' projects.')

df = pd.DataFrame(scraped_data)
df.to_csv('kickstarter-data1.csv')
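One thing worth checking when fields beyond Category go missing: BeautifulSoup's find() returns None when a class is not present on the page, and calling .text on None raises an AttributeError. Guarding each lookup makes such failures visible instead of silently losing fields. A small helper sketch, hypothetical and not from the original post:

def text_or_default(tag, default='None'):
    # find() returns None when the element is missing; guard before .text
    return tag.text.strip() if tag is not None else default

# Hypothetical usage with the names from the script above:
# project_dict['Name'] = text_or_default(
#     soup1.find(class_='type-24 type-28-sm type-38-md navy-700 medium mb3'))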
