Python Get Request All Pages Movie list - python

When I run the snippet below, it does not return the values of Page, Total pages and data.
It also does not return the value of the function "getMovieTitles".
import request
import json
def getMovieTitles(substr):
    titles = []
    url = "https://jsonmock.hackerrank.com/api/movies/search/?Title={}'.format(substr)"
    data = requests.get(url)
    print(data)
    response = json.loads(data.content.decode('utf-8'))
    print(data.content)
    for page in range(0, response['total_pages']):
        page_response = requests.get("https://jsonmock.hackerrank.com/api/movies/search/?Title={}}&page={}".format(substr, page + 1))
        page_content = json.loads(page_response.content.decode('utf-8'))
        print('page_content', page_content, 'type(page_content)', type(page_content))
        for item in range(0, len(page_content['data'])):
            titles.append(str(page_content['data'][item]['Title']))
    titles.sort()
    return titles

print(getMovieTitles('Superman'))

You're not formatting the url string correctly.
url = "https://jsonmock.hackerrank.com/api/movies/search/?Title={}'.format(substr)"
format() is a string method, but you've put the call inside the URL string itself. Instead do:
url = "https://jsonmock.hackerrank.com/api/movies/search/?Title={}".format(substr)

First, fix the import:
import requests
The other problem is in your string formatting: the URL string is closed with ' instead of ", which leaves the .format() call inside the literal. It should be:
url = "https://jsonmock.hackerrank.com/api/movies/search/?Title={}".format(substr)
And there is one } too many in the paged URL:
page_response = requests.get("https://jsonmock.hackerrank.com/api/movies/search/?Title={}&page={}".format(substr, page + 1))
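Putting both fixes together, a sketch of the corrected function (same logic and the same API fields as the original):

import requests
import json

def getMovieTitles(substr):
    titles = []
    url = "https://jsonmock.hackerrank.com/api/movies/search/?Title={}".format(substr)
    response = json.loads(requests.get(url).content.decode('utf-8'))
    # walk every page and collect the titles
    for page in range(response['total_pages']):
        page_response = requests.get(
            "https://jsonmock.hackerrank.com/api/movies/search/?Title={}&page={}".format(substr, page + 1))
        page_content = json.loads(page_response.content.decode('utf-8'))
        for item in page_content['data']:
            titles.append(str(item['Title']))
    titles.sort()
    return titles

print(getMovieTitles('Superman'))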

Visible and search URLs for webscraping

When I try to apply filters on the website before webscraping, it takes me to the following URL - https://www.marktplaats.nl/l/auto-s/p/2/#f:10898,10882
However, when I apply it in my script to retrieve the href for each and every advertisement, it yields results from this URL instead - https://www.marktplaats.nl/l/auto-s/p/2, completely ignoring two of my filters (namely #f:10898,10882).
Can you please advise me what my problem is?
import requests
import bs4
import pandas as pd

frames = []
for pagenumber in range(0, 2):
    url = 'https://www.marktplaats.nl/l/auto-s/p/'
    add_url = '/#f:10898,10882'
    txt = requests.get(url + str(pagenumber) + add_url)
    soup = bs4.BeautifulSoup(txt.text, 'html.parser')
    soup_table = soup.find('ul', 'mp-Listings mp-Listings--list-view')
    for car in soup_table.findAll('li'):
        link = car.find('a')
        sub_url = 'https://www.marktplaats.nl/' + link.get('href')
        sub_soup = requests.get(sub_url)
        soup1 = bs4.BeautifulSoup(sub_soup.text, 'html.parser')
I would suggest that you use their API instead, which seems to be open.
If you open the link below, you will see all the same listings you are searching for, with the appropriate filters applied and no HTML to parse (use something to pretty-print the JSON, since it will otherwise look like just a wall of text). You can also modify it easily in the request just by changing the headers or query parameters.
https://www.marktplaats.nl/lrp/api/search?attributesById[]=10898&attributesById[]=10882&l1CategoryId=91&limit=30&offset=0
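For instance, a quick way to fetch and pretty-print that response (a sketch):

import json
import requests

url = ('https://www.marktplaats.nl/lrp/api/search'
       '?attributesById[]=10898&attributesById[]=10882&l1CategoryId=91&limit=30&offset=0')
print(json.dumps(requests.get(url).json(), indent=2))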
In code it would look something like this:
def getcars():
    url = 'https://www.marktplaats.nl/lrp/api/search'
    # a dict cannot hold the same key twice, so pass both attribute ids as a list;
    # requests will repeat the parameter: attributesById[]=10898&attributesById[]=10882
    querystring = {
        'attributesById[]': [10898, 10882],
        'l1CategoryId': 91,
        'limit': 30,
        'offset': 0
    }
    headers = {}
    response = requests.get(url, headers=headers, params=querystring)
    x = response.json()
    return x

cars = getcars()
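If you need more than the first 30 results, the limit/offset parameters suggest the usual paging scheme; a minimal sketch, assuming the API honours offset the way the URL implies:

import requests

url = 'https://www.marktplaats.nl/lrp/api/search'
base_params = {'attributesById[]': [10898, 10882], 'l1CategoryId': 91, 'limit': 30}
pages = []
for offset in (0, 30, 60):  # first three pages as an example
    params = dict(base_params, offset=offset)
    pages.append(requests.get(url, params=params).json())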

Cannot get a specific href out of requests

I'm trying to capture a unique URL using Python's Requests.
Source website is https://www.realestate.com.au/property/1-10-grosvenor-rd-terrigal-nsw-2260
Goal Url is http://www.realestate.com.au/sold/property-unit-nsw-terrigal-124570934
When I tried
(Unique_ID,) = (x.text_content() for x in tree.xpath('//a[@class="property-value__link--muted rui-button-brand property-value__btn-listing"]'))
The CSV returned View Listing
Unless I'm mistaken, I've done the correct class search, as the href would not be unique enough? Am I supposed to do something different to capture URLs instead of text?
Full code below if required.
Thanks in advance.
import requests
import csv
import datetime
import pandas as pd
import csv
from lxml import html

df = pd.read_excel("C:\Python27\Projects\REA_UNIQUE_ID\\UN.xlsx", sheetname="UN")
dnc = df['Property']
dnc_list = list(dnc)
url_base = "https://www.realestate.com.au/property/"
URL_LIST = []
for nd in dnc_list:
    nd = nd.strip()
    nd = nd.lower()
    nd = nd.replace(" ", "-")
    URL_LIST.append(url_base + nd)

text2search = '''The information provided'''

with open('Auctions.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for index, url in enumerate(URL_LIST):
        page = requests.get(url)
        print '\r' 'Scraping URL ' + str(index+1) + ' of ' + str(len(URL_LIST)),
        if text2search in page.text:
            tree = html.fromstring(page.content)
            (title,) = (x.text_content() for x in tree.xpath('//title'))
            (Unique_ID,) = (x.text_content() for x in tree.xpath('//a[@class="property-value__link--muted rui-button-brand property-value__btn-listing"]'))
            #(sold,) = (x.text_content().strip() for x in tree.xpath('//p[@class="property-value__agent"]'))
            writer.writerow([title, Unique_ID])
text_content() allows you to get the text only. Try to scrape @href instead, as below:
(Unique_ID,) = (x for x in tree.xpath('//a[@class="property-value__link--muted rui-button-brand property-value__btn-listing"]/@href'))
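An equivalent option, using the same tree, is to select the <a> element itself and read its href attribute with lxml's .get() (a sketch):

links = tree.xpath('//a[@class="property-value__link--muted rui-button-brand property-value__btn-listing"]')
if links:
    Unique_ID = links[0].get('href')  # the raw attribute value, i.e. the listing URL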

Google news crawler flip pages

Continuing on previous work to crawl all news results about a query and return the title and URL, I am refining the crawler to get the results from all pages in Google News. The current code seems to return only the first page of Google News search results. I would be grateful to know how to get the results from all pages. Many thanks!
My code is below:
import requests
from bs4 import BeautifulSoup
import time
import datetime
from random import randint
import numpy as np
import pandas as pd

query2Google = input("What do you want from Google News?\n")

def QGN(query2Google):
    s = '"'+query2Google+'"'  # Keywords for query
    s = s.replace(" ","+")
    date = str(datetime.datetime.now().date())  # timestamp
    filename = query2Google+"_"+date+"_"+'SearchNews.csv'  # csv filename
    f = open(filename,"wb")
    url = "http://www.google.com.sg/search?q="+s+"&tbm=nws&tbs=qdr:y"  # URL for query of news results within one year and sort by date
    #htmlpage = urllib2.urlopen(url).read()
    time.sleep(randint(0, 2))  # waiting
    htmlpage = requests.get(url)
    print("Status code: "+ str(htmlpage.status_code))
    soup = BeautifulSoup(htmlpage.text,'lxml')
    df = []
    for result_table in soup.findAll("div", {"class": "g"}):
        a_click = result_table.find("a")
        #print ("-----Title----\n" + str(a_click.renderContents()))  # Title
        #print ("----URL----\n" + str(a_click.get("href")))  # URL
        #print ("----Brief----\n" + str(result_table.find("div", {"class": "st"}).renderContents()))  # Brief
        #print ("Done")
        df = np.append(df,[str(a_click.renderContents()).strip("b'"),str(a_click.get("href")).strip('/url?q='),str(result_table.find("div", {"class": "st"}).renderContents()).strip("b'")])
    df = np.reshape(df,(-1,3))
    df1 = pd.DataFrame(df,columns=['Title','URL','Brief'])
    print("Search Crawl Done!")
    df1.to_csv(filename, index=False, encoding='utf-8')
    f.close()
    return

QGN(query2Google)
There used to be an AJAX API, but it's no longer available.
Still, you can modify your script with a for loop if you want a fixed number of pages, or a while loop if you want all pages.
Example:
url = "http://www.google.com.sg/search?q="+s+"&tbm=nws&tbs=qdr:y&start="
pages = 10 # the number of pages you want to crawl #
for next in range(0, pages*10, 10) :
page = url + str(next)
time.sleep(randint(1, 5)) # you may need longer than that #
htmlpage = requests.get(page) # you should add User-Agent and Referer #
print("Status code: " + str(htmlpage.status_code))
if htmlpage.status_code != 200 :
break # something went wrong #
soup = BeautifulSoup(htmlpage.text, 'lxml')
... process response here ...
next_page = soup.find('td', { 'class':'b', 'style':'text-align:left' })
if next_page is None or next_page.a is None :
break # there are no more pages #
Keep in mind that Google doesn't like bots; you might get banned.
You could add 'User-Agent' and 'Referer' headers to simulate a web browser, and use time.sleep(random.uniform(2, 6)) to simulate a human... or use selenium.
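For example, a sketch of the request with browser-like headers (the header values below are only illustrative):

import requests

headers = {
    # illustrative values; copy the User-Agent string from a real browser
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Referer': 'http://www.google.com.sg/',
}
htmlpage = requests.get(
    "http://www.google.com.sg/search?q=test&tbm=nws&tbs=qdr:y&start=0",
    headers=headers)
print(htmlpage.status_code)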
You can also add &num=25 to the end of your query and you'll get back a page with that number of results; in this example you'll get 25 Google results.
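For example, building on the query string s from the question:

url = "http://www.google.com.sg/search?q=" + s + "&tbm=nws&tbs=qdr:y&num=25"  # ask for 25 results per page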

Add a string into a url python

I am trying to add a string in the middle of a URL. Somehow my output looks like this:
http://www.Holiday.com/('Woman',)/Beach
http://www.Holiday.com/('Men',)/Beach
However, it should look like this:
http://www.Holiday.com/Woman/Beach
http://www.Holiday.com/Men/Beach
The code which I am using looks like the following:
list = {'Woman','Men'}
url_test = 'http://www.Holiday.com/{}/Beach'

for i in zip(list):
    url = url_test.format(str(i))
    print(url)
Almost there. Just no need for zip:
items = {'Woman','Men'}  # notice that this is a `set` and not a list
url_test = 'http://www.Holiday.com/{}/Beach'

for i in items:
    url = url_test.format(i)
    print(url)
The purpose of the zip function is to join several collections by the index of the item. When zip joins the values from each collection, it places them in a tuple, whose __str__ representation is exactly what you got.
Here you just want to iterate over the items in the collection.
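A quick illustration of the difference:

for i in zip({'Woman', 'Men'}):
    print(i)  # ('Woman',) / ('Men',)  -- zip yields 1-tuples
for i in {'Woman', 'Men'}:
    print(i)  # Woman / Men  -- the plain strings you want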
You can try this also. And please don't use list as a variable name.
lst = {'Woman','Men'}
url_test = 'http://www.Holiday.com/%s/Beach'

for i in lst:
    url = url_test % i
    print(url)
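On Python 3.6+ an f-string does the same job without a separate template string:

items = {'Woman', 'Men'}
for i in items:
    print(f'http://www.Holiday.com/{i}/Beach')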
from urllib.request import urlopen
from bs4 import BeautifulSoup as BS
import re  # needed for re.compile below

url = "https://www.imdb.com/chart/top?ref_=nv_mv_250"
html = urlopen(url)
url_list = BS(html, 'lxml')
type(url_list)

all_links = url_list.find_all('a', href=re.compile("/title/tt"))
for link in all_links:
    print(link.get("href"))
    all_urls = link.get("href")

url_test = 'http://www.imdb.com/{}/'
for i in all_urls:
    urls = url_test.format(i)
    print(urls)
This is the code to scrape the URLs of all 250 movies from the main URL,
but the code gives the result as:
http://www.imdb.com///
http://www.imdb.com/t/
http://www.imdb.com/i/
http://www.imdb.com/t/
http://www.imdb.com/l/
http://www.imdb.com/e/
http://www.imdb.com///
and so on ...
How can I split all_urls using a comma, or how can I make a list of URLs in all_urls?
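For reference, a minimal sketch of the intended fix: all_urls above ends up holding just the last href as a single string, so the second loop walks over its characters one by one. Collecting every href into a list instead gives one URL per movie:

from urllib.request import urlopen
from bs4 import BeautifulSoup as BS
import re

soup = BS(urlopen("https://www.imdb.com/chart/top?ref_=nv_mv_250"), 'lxml')
# build a list with one href per movie instead of overwriting a single string
all_urls = [link.get("href") for link in soup.find_all('a', href=re.compile("/title/tt"))]
for href in all_urls:
    print('http://www.imdb.com' + href)  # each href already starts with /title/tt..., so just prepend the host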

Scraperwiki: how to save data into one cell in table

Here is my code for the scraper that is extracting the URL and corresponding comments from that particular page:
import scraperwiki
import lxml.html
from BeautifulSoup import BeautifulSoup
import urllib2
import re

for num in range(1, 2):
    html_page = urllib2.urlopen("https://success.salesforce.com/ideaSearch?keywords=error&pageNo="+str(num))
    soup = BeautifulSoup(html_page)
    for i in range(0, 10):
        for link in soup.findAll('a', {'id': 'search:ForumLayout:searchForm:itemObj2:'+str(i)+':idea:recentIdeasComponent:profileIdeaTitle'}):
            pageurl = link.get('href')
            html = scraperwiki.scrape(pageurl)
            root = lxml.html.fromstring(html)
            for j in range(0, 300):
                for table in root.cssselect("span[id='ideaView:ForumLayout:ideaViewForm:cmtComp:ideaComments:cmtLoop:"+str(j)+":commentBodyOutput'] table"):
                    divx = table.cssselect("div[class='htmlDetailElementDiv']")
                    if len(divx) == 1:
                        data = {
                            'URL': pageurl,
                            'Comment': divx[0].text_content()
                        }
                        print data
                        scraperwiki.sqlite.save(unique_keys=['URL'], data=data)
                        scraperwiki.sqlite.save(unique_keys=['Comment'], data=data)
When the data is saved to the scraperwiki datastore, only the last comment from each URL ends up in the table. What I would like is for each URL to have all of its comments saved: one column with the URL and a second column with all the comments from that URL, instead of just the last comment, which is what this code ends up with.
As I can see from your code, you build data in the innermost for loop and assign it a new value on every iteration, so each save overwrites the previous comment for that URL and only the last comment survives. I think you may use something like:
for i in range(0, 10):
    for link in soup.findAll('a', {'id': 'search:ForumLayout:searchForm:itemObj2:'+str(i)+':idea:recentIdeasComponent:profileIdeaTitle'}):
        pageurl = link.get('href')
        html = scraperwiki.scrape(pageurl)
        root = lxml.html.fromstring(html)
        data = {'URL': pageurl, 'Comment': []}
        for j in range(0, 300):
            for table in root.cssselect("span[id='ideaView:ForumLayout:ideaViewForm:cmtComp:ideaComments:cmtLoop:"+str(j)+":commentBodyOutput'] table"):
                divx = table.cssselect("div[class='htmlDetailElementDiv']")
                if len(divx) == 1:
                    data['Comment'].append(divx[0].text_content())  # call text_content(); appending the bare method would store a bound method object
        scraperwiki.sqlite.save(unique_keys=['URL'], data=data)
        scraperwiki.sqlite.save(unique_keys=['Comment'], data=data)
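If the datastore does not accept a Python list for the Comment column, joining the comments into a single string before saving keeps them all in one cell, which is what the question asks for (a sketch, assuming newline-separated comments are acceptable):

data['Comment'] = '\n'.join(data['Comment'])
scraperwiki.sqlite.save(unique_keys=['URL'], data=data)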
