I have been trying to scrape data from xhamster channels for my research using this code:
import json
from multiprocessing.dummy import Pool as ThreadPool

from lxml import html

from util import req


def get_channel_urls(url):
    r = req(url)
    tree = html.fromstring(r.text)
    print("Done", url)
    return [x.attrib['href'] for x in tree.xpath('//div[@class="item"]/a')]


def write_channel_data(url):
    r = req(url)
    html_text = r.text
    tree = html.fromstring(html_text)
    json_data = json.loads(
        tree.xpath('//script[@id="initials-script"]/text()')[0].strip().split("window.initials =")[1][:-1].strip())
    with open("channel_html/{}".format(json_data['sponsorChannel']['inurl']), 'w', encoding='utf-8') as outfile:
        outfile.write(html_text)
    print("Written data for:", url)


def main():
    letters = '0abcdefghijklmnopqrstuvwxyz'
    index_urls = ['https://xhamster.com/channels/all/{}'.format(index_letter) for index_letter in letters]
    index_urls.extend(['https://xhamster.com/gay/channels/all/{}'.format(index_letter) for index_letter in letters])
    index_urls.extend(['https://xhamster.com/shemale/channels/all/{}'.format(index_letter) for index_letter in letters])

    channel_urls = []
    for url in index_urls:
        channel_urls.extend(get_channel_urls(url))

    with open('channel_urls', 'w') as channel_url_backup_file:
        channel_url_backup_file.write("\n".join(channel_urls))

    # with open('channel_urls') as i:  # THIS IS TO READ A PRE-DOWNLOADED URL FILE
    #     channel_urls = [url.strip() for url in i.read().split()]

    with ThreadPool(processes=10) as pool:
        pool.map(write_channel_data, channel_urls)


if __name__ == '__main__':
    main()
It does work for a while, but then I get this error. The error is obviously in the main() function, but I can't figure out how to solve it:
IndexError: list index out of range
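A likely culprit (not stated in the original post) is the [0] index on the xpath result in write_channel_data: if a page does not contain the initials-script element (for example a block page or an unexpected layout), the xpath returns an empty list and indexing it raises the IndexError. A minimal defensive sketch, assuming the rest of the code stays the same:

def write_channel_data(url):
    r = req(url)
    html_text = r.text
    tree = html.fromstring(html_text)
    scripts = tree.xpath('//script[@id="initials-script"]/text()')
    if not scripts:
        # The page did not contain the expected script tag; skip it instead of crashing.
        print("No initials-script found, skipping:", url)
        return
    json_data = json.loads(scripts[0].strip().split("window.initials =")[1][:-1].strip())
    with open("channel_html/{}".format(json_data['sponsorChannel']['inurl']), 'w', encoding='utf-8') as outfile:
        outfile.write(html_text)
    print("Written data for:", url)

The same guard pattern applies to the split("window.initials =") step, which also assumes the script content has the expected shape.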
Related
I am writing a program that scrapes data from multiple URLs using multiprocessing. I store all the URLs in the bonds_url list. It works and I get output, but the problem is that the output comes back in a random order. I want the scraped data to be in the same order as the URLs in bonds_url.
Is there any solution for that?
from requests_html import HTMLSession
from multiprocessing import Pool

import constants

bonds_url = []


def f(url):
    session = HTMLSession()
    response = session.get(url)
    try:
        data = [i.text.strip() for i in response.html.find(".value")]
        bonds_values.append(float(data[0]))
        print(data[0])
    except:
        data = [i.text.strip() for i in response.html.find("div>span[data-reactid='31']")]
        bonds_values.append(float(data[0]))
        print(data[0])


if __name__ == '__main__':
    with Pool(len(bonds_url)) as p:
        p.map(f, bonds_url)
Solution
Change the prints in f to returns; multiprocessing.Pool.map collects the return values in the same order as its input iterable, so the results line up with bonds_url.
from multiprocessing import Pool
from requests_html import HTMLSession

import constants

bonds_url = []


def f(url):
    session = HTMLSession()
    response = session.get(url)
    try:
        data = [i.text.strip() for i in response.html.find(".value")]
    except:
        data = [i.text.strip() for i in response.html.find("div>span[data-reactid='31']")]
    return float(data[0])


if __name__ == '__main__':
    with Pool(len(bonds_url)) as p:
        bond_values = p.map(f, bonds_url)
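Because the order is preserved, the values can be zipped back to their URLs afterwards. A short sketch under the same assumptions (bonds_url has been filled in before the pool is created):

if __name__ == '__main__':
    with Pool(len(bonds_url)) as p:
        bond_values = p.map(f, bonds_url)

    # Order is preserved by Pool.map, so zipping gives a URL -> value mapping.
    results_by_url = dict(zip(bonds_url, bond_values))
    for url, value in results_by_url.items():
        print(url, value)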
I wrote some code to scrape title URLs, but I'm getting an error while extracting them, so could you please guide me?
Here is my code:
import requests
from bs4 import BeautifulSoup
# import pandas as pd
import csv


def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')  # 1. html, 2. parser
        return soup


def get_index_data(soup):
    try:
        titles_link = soup.find_all('a', class_="body_link_11")
    except:
        titles_link = []
    # urls = [item.get('href') for item in titles_link]
    print(titles_link)


def main():
    mainurl = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/" \
              "searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
    get_index_data(get_page(mainurl))


if __name__ == '__main__':
    main()
If you want to get all the links, try this:
def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')  # 1. html, 2. parser
        return soup


def get_index_data(soup):
    try:
        titles_link = soup.find_all('a', class_="body_link_11")
    except:
        titles_link = []
    else:
        titles_link_output = []
        for link in titles_link:
            try:
                item_id = link.attrs.get('item_id', None)  # All titles with valid links will have an item_id
                if item_id:
                    titles_link_output.append("{}{}".format("http://cgsc.cdmhost.com", link.attrs.get('href', None)))
            except:
                continue
        print(titles_link_output)


def main():
    mainurl = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
    get_index_data(get_page(mainurl))


main()
Output:
['http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2653/rec/1', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2385/rec/2', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3309/rec/3', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2425/rec/4', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/150/rec/5', 'http://cgsc.cdmhost.com/cdm/compoundobject/collection/p4013coll8/id/2501/rec/6', 'http://cgsc.cdmhost.com/cdm/compoundobject/collection/p4013coll8/id/2495/rec/7', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3672/rec/8', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3407/rec/9', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/4393/rec/10', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3445/rec/11', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3668/rec/12', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3703/rec/13', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2952/rec/14', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2898/rec/15', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3502/rec/16', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3553/rec/17', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/4052/rec/18', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3440/rec/19', 'http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/3583/rec/20']
I am using BeautifulSoup to practice getting website content.
But there are duplicates in the output: starting from the second page, the entries repeat themselves.
I tried to modify the code in the for loop, but it still repeats.
# coding: utf-8
import lxml
import json
import re
import requests
from bs4 import BeautifulSoup


def the_url(url):
    user_agent = "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"
    headers = {"User-Agent": user_agent}
    r = requests.get(url, headers=headers)
    return r.text


def get_text(page_html):
    the_web = BeautifulSoup(page_html, 'html.parser')
    base_url = "https://cn.reuters.com"
    list_div = the_web.find('div', {"id": 'chinaNews'})
    list_li = list_div.find_all('li')
    for t in list_li:
        the_dict = {}
        a = t.find('a')
        excerpt = t.find('div', {"class": 'smalltext'})
        if a:
            the_dict['link'] = base_url + a.get('href')
            the_dict['title'] = a.get_text()
        if excerpt:
            the_dict['excerpt'] = excerpt.get_text()
        result_list.append(the_dict)


def save_to_json(result):
    s = json.dumps(result, indent=4, ensure_ascii=False)
    # json file
    with open('text.json', 'w', encoding='utf-8') as f:
        f.write(s)


def main():
    for i in range(2):
        i = i + 1
        url = 'http://cn.mobile.reuters.com/category/chinaNews?p={}'.format(i)
        page_html = the_url(url)
        get_text(page_html)
    save_to_json(result_list)


if __name__ == '__main__':
    result_list = []
    main()
I want to remove the duplicates in the output.
You can check whether a dict with the same link is already in result_list:
if the_dict and not any(r['link'] == the_dict['link'] for r in result_list):
    # No dict with this link exists in result_list yet
    result_list.append(the_dict)
Here is your get_text method with that check added:
def get_text(page_html):
    the_web = BeautifulSoup(page_html, 'html.parser')
    base_url = "https://cn.reuters.com"
    list_div = the_web.find('div', {"id": 'chinaNews'})
    list_li = list_div.find_all('li')
    for t in list_li:
        the_dict = {}
        a = t.find('a')
        excerpt = t.find('div', {"class": 'smalltext'})
        if a:
            the_dict['link'] = base_url + a.get('href')
            the_dict['title'] = a.get_text()
        if excerpt:
            the_dict['excerpt'] = excerpt.get_text()
        if the_dict and not any(r['link'] == the_dict['link'] for r in result_list):
            result_list.append(the_dict)
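As a side note (not in the original answer), scanning result_list with any() is linear per item; if many pages are scraped, a set of already-seen links keeps the membership test constant-time. A sketch under the same assumptions (result_list is the module-level list from the question, and every item of interest has a link):

seen_links = set()


def get_text(page_html):
    the_web = BeautifulSoup(page_html, 'html.parser')
    base_url = "https://cn.reuters.com"
    list_div = the_web.find('div', {"id": 'chinaNews'})
    list_li = list_div.find_all('li')
    for t in list_li:
        a = t.find('a')
        excerpt = t.find('div', {"class": 'smalltext'})
        if not a:
            continue
        link = base_url + a.get('href')
        if link in seen_links:
            continue  # already collected on a previous page
        seen_links.add(link)
        the_dict = {'link': link, 'title': a.get_text()}
        if excerpt:
            the_dict['excerpt'] = excerpt.get_text()
        result_list.append(the_dict)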
This is the code I used to take all the pics from r/pics on reddit and put them into a directory. I want to be able to take the actual files in the directory and put them into a list, but I'm stuck on how to do this.
import requests
from bs4 import BeautifulSoup as bs
import os

url = "https://www.reddit.com/r/pics/"
r = requests.get(url)
data = r.text
soup = bs(data, 'lxml')

image_tags = soup.findAll('img')

if not os.path.exists('direct'):
    os.makedirs('direct')
os.chdir('direct')

x = 0

for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        if source.status_code == 200:
            img_path = 'direct-' + str(x) + '.jpg'
            with open(img_path, 'wb') as f:
                f.write(requests.get(url).content)
                f.close()
            x += 1
    except:
        pass
Edit: Here is the updated code, but I'm still dealing with the problem.
import requests
from bs4 import BeautifulSoup as bs
import os

url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
soup = bs(data, 'lxml')

image_tags = soup.findAll('img')

if not os.path.exists('directory'):
    os.makedirs('directory')
os.chdir('directory')

x = 0
mylist = []

for image in image_tags:
    url = image['src']
    source = requests.get(url)
    if source.status_code == 200:
        img_path = 'direct-' + str(x) + '.jpg'
        with open(img_path, 'wb') as f:
            f.write(requests.get(url).content)
            mylist.append(img_path)
            f.close()
        x += 1

print(mylist)
Create a list at the beginning of your code:

...
mylist = []
...

Then, after you save each image, add its path to the list:

...
img_path = 'direct-' + str(x) + '.jpg'
mylist.append(img_path)
...
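As an aside (not part of the original answer), if the goal is simply to list the files that were already written to disk, the standard library can build that list directly from the folder. A minimal sketch, assuming the images were saved into a folder named directory and the script runs from its parent folder:

import os

image_dir = 'directory'  # the folder the images were saved into (assumption)
saved_files = sorted(os.listdir(image_dir))
print(saved_files)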
EDIT:
I executed your updated code and image_tags comes back empty. In fact, the page returned by

url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text

doesn't contain any images. I guess reddit has some kind of protection to prevent you from fetching images this way.
Try adding print(data) and you will see what I mean.
You should use the reddit API so that reddit doesn't limit your requests.
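For example (a hedged sketch, not from the original answer), reddit exposes a public JSON listing for every subreddit; sending a descriptive User-Agent and reading the post URLs from that listing avoids parsing HTML entirely. The field names below reflect reddit's listing format as I understand it, so treat them as assumptions:

import requests

listing_url = "https://www.reddit.com/r/pics.json"  # public JSON listing for the subreddit
headers = {"User-Agent": "research-scraper/0.1"}  # a descriptive User-Agent helps avoid throttling

resp = requests.get(listing_url, headers=headers)
resp.raise_for_status()

image_urls = []
for child in resp.json()["data"]["children"]:
    post = child["data"]
    # Keep only direct image posts; 'post_hint' can be missing on some posts.
    if post.get("post_hint") == "image":
        image_urls.append(post["url"])

print(image_urls)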
Hi, I'm trying to scrape the front-page images on digg.com with the following code. The issue is that 0.jpg to 6.jpg are normal, but 7.jpg to 47.jpg are corrupt. I'm not sure why.
Here is the code. GitHub repo here: https://github.com/kenpeter/py_mm
# os
import os
# http request
import requests
#
import pprint
import time
# import html from lxml
from lxml import html

# global
global_page_num = 0
pp = pprint.PrettyPrinter(indent=4)


# write to file
def download_image(img_urls):
    # total img urls
    amount = len(img_urls)
    # loop
    for index, value in enumerate(img_urls, start=0):
        # file name
        filename = 'img/%s.jpg' % (index)
        # dir
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        print('--- start ---')
        print('filename: %s' % filename)
        print('Downloading: %s out of %s' % (index, amount))
        # open file
        with open(filename, 'wb') as f:
            # f write
            # time.sleep(1)
            f.write(requests.get(value).content)


def get_page_number(num):
    url = 'http://digg.com'
    response = requests.get(url).content
    selector = html.fromstring(response)

    img_urls = []
    img_urls = selector.xpath("//div[@class='digg-story__image--thumb']/a/img/@src")

    news_texts = []
    news_texts = selector.xpath("//div[@itemprop='description']/text()")

    # test
    # print('--- something ---')
    # pp.pprint(img_urls)
    # pp.pprint(news_texts)

    download_image(img_urls)

    return img_urls


if __name__ == '__main__':
    # input, page_number, everything into the var
    # page_number = input('Please enter the page number that you want to scrape:')
    # global_page_num
    # global_page_num = page_number;
    # print('hell world!');
    page_number = 4  # hardcode
    get_page_number(page_number)
The reason the images are "corrupt" is that the markup changes within the page: further down, the real image URL "hides" in the data-src attribute instead of src, while src (which your code grabs) only holds a placeholder. Here is an example from the source of the grabbed page showing both attributes:
<img
class="digg-story__image-img js--digg-story__image-img lazy-image-img need-offset"
data-src="http://static.digg.com/images/f0b92c2d8a2c4b7f829abbc0e58a408c_2oijd0Z_1_www_large_thumb.jpeg"
src="http://static.digg.com/static/fe/944294/images/x_455x248.png"
width="312"
height="170"
alt=""
/>
In other words, you have to check both attributes, src and data-src, giving data-src priority over src while creating the list of image URLs.
This code does the trick and downloads the proper images:
# os
import os
# http request
import requests
#
import pprint
import time
# import html from lxml
from lxml import html

# global
global_page_num = 0
pp = pprint.PrettyPrinter(indent=4)


# write to file
def download_image(img_urls):
    # total img urls
    amount = len(img_urls)
    # loop
    for index, value in enumerate(img_urls, start=0):
        # file name
        filename = 'img/%s.jpg' % (index)
        # dir
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        print('--- start ---')
        print('filename: %s' % filename)
        print('Downloading: %s out of %s' % (index, amount))
        # open file
        with open(filename, 'wb') as f:
            # f write
            # time.sleep(1)
            f.write(requests.get(value).content)


def get_page_number(num):
    url = 'http://digg.com'
    response = requests.get(url).content
    selector = html.fromstring(response)

    img_urls = []
    img_urls_1a = selector.xpath("//div[@class='digg-story__image--thumb']/a/img/@src")
    img_urls_1b = [item for item in img_urls_1a if 'x_455x248.png' not in item]
    img_urls_2 = selector.xpath("//div[@class='digg-story__image--thumb']/a/img/@data-src")
    img_urls = img_urls_1b + img_urls_2
    # print(img_urls)

    news_texts = []
    news_texts = selector.xpath("//div[@itemprop='description']/text()")

    # test
    # print('--- something ---')
    # pp.pprint(img_urls)
    # pp.pprint(news_texts)

    download_image(img_urls)

    return img_urls


if __name__ == '__main__':
    # input, page_number, everything into the var
    # page_number = input('Please enter the page number that you want to scrape:')
    # global_page_num
    # global_page_num = page_number;
    # print('hell world!');
    page_number = 4  # hardcode
    get_page_number(page_number)
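As a design note (not part of the original answer), concatenating the filtered src list with the data-src list can change the order of the images relative to the page. An alternative sketch that walks the img elements one by one and prefers data-src per element, preserving page order (same XPath as above assumed):

def collect_image_urls(selector):
    # Prefer data-src over src for each thumbnail, keeping the page order.
    img_urls = []
    for img in selector.xpath("//div[@class='digg-story__image--thumb']/a/img"):
        src = img.get('data-src') or img.get('src')
        if src and 'x_455x248.png' not in src:  # skip the lazy-load placeholder image
            img_urls.append(src)
    return img_urls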