I am trying to crawl this link by sending JSON requests. My first request is:
parameters1 = {'ticker':'XOM', 'countryCode':'US',
'dateTime':'', 'docId':'1222737422 ',
'docType':'806','sequence':'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2',
'messageNumber':'','count':'10',
'channelName':'/news/latest/company/us/xom', 'topic':'',
'_':'' }
firstUrl = "http://www.marketwatch.com/news/headline/getheadlines"
html1 = requests.get(firstUrl, params = parameters1, headers = header)
html_json1=(json.loads(html1.text))
To send the next requests, I have to extract docId from the corresponding HTML and add it to the new parameters. I don't know how to do that. Do you have any idea how to get the new HTML file after sending the JSON requests?
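The key idea, which the answer below implements in full, is that each item of the JSON response carries a UniqueId that can be fed back into the parameters of the next request, either as docId or split into sequence and messageNumber. A minimal sketch of that idea, reusing the firstUrl, parameters1 and header objects assumed above:
import requests
import json

resp = requests.get(firstUrl, params=parameters1, headers=header)
items = json.loads(resp.text)               # list of headline dicts
last_uid = items[-1]['UniqueId']            # e.g. 'e5a00f51-...:8499' or a plain docId
if ':' in last_uid:                         # 'sequence:messageNumber' form
    parameters1['sequence'], parameters1['messageNumber'] = last_uid.split(':')
    parameters1['docId'] = ''
else:                                       # plain document id form
    parameters1['docId'] = last_uid
next_page = requests.get(firstUrl, params=parameters1, headers=header)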
import requests
import json
from bs4 import BeautifulSoup
def main():
html_url = 'http://www.marketwatch.com/investing/stock/xom'
resp = requests.get(html_url)
if resp.status_code != 200:
raise Exception("http request failed: %s" % resp)
soup = BeautifulSoup(resp.text, 'lxml')
# get value of `data-uniqueid` from last news node of 'MarketWatch News on XOM'
li_node = soup.select("#mwheadlines > div.headlinewrapper > ol > li[data-uniqueid]")[-1]
unique_id = li_node['data-uniqueid']
print('got unique_id=%r, from %r' % (unique_id, li_node.text.replace('\n', ' ').strip()))
baseUrl = 'http://www.marketwatch.com/news/headline/getheadlines'
parameters = {
'ticker':'XOM',
'countryCode':'US',
'docType':'806',
'docId': '', # (optional) initial value extracted from the HTML page
'sequence':'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2', # initial value extracted from the HTML page
'messageNumber':'8589', # initial value extracted from the HTML page
'count':'10',
'channelName': '/news/latest/company/us/xom',
}
parameters.update(extract_page_params(unique_id))
while True:
resp = requests.get(baseUrl, params = parameters)
data = json.loads(resp.text) # array of size 10
first = data[0] # get first item of array
last = data[-1] # get last item of array
print("\ngot %d data, url: %s" % (len(data), resp.url))
print("\tfirst: %-42s, %s" % (first['UniqueId'], first['SeoHeadlineFragment']))
print("\t last: %-42s, %s" % (last['UniqueId'], last['SeoHeadlineFragment']))
print("")
uid = last['UniqueId'] # get value of UniqueId from dict object `last`
parameters.update(extract_page_params(uid))
input("press <enter> to get next")
def extract_page_params(uid):
sequence = ''
messageNumber = ''
docId = ''
if ':' in uid: # if the character ':' appears in the string `uid`
# uid looks like `e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2:8499`
# so split it by ':'
sequence, messageNumber = uid.split(':')
else:
docId = uid
return {
'sequence': sequence,
'messageNumber': messageNumber,
'docId': docId,
}
if __name__ == '__main__':
main()
This is my code to solve your problem.
Since you are new to programming, I have added some comments.
You can copy it directly and run it with Python 3 (Python 2 should work as well).
You can use Beautiful Soup to extract data from HTML; it is a Python library for pulling data out of HTML files.
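As a quick illustration of how Beautiful Soup is used (a minimal standalone sketch; the HTML string here is just an example, not the MarketWatch page):
from bs4 import BeautifulSoup

html = '<ol><li data-uniqueid="abc:123">First headline</li></ol>'
soup = BeautifulSoup(html, 'html.parser')      # parse the HTML string
li = soup.select_one('li[data-uniqueid]')      # CSS-selector lookup, as in the code above
print(li['data-uniqueid'], '-', li.text)       # prints: abc:123 - First headline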
Related
I am making a Python script that uses the API of a free test automation website called TestProject.
Link to their API: https://api.testproject.io/docs/v2/
Basically, what I want to do is grab the PDF reports of all tests and save them somewhere.
But to make the GET request for that, I first need a projectID and a jobID, for which I have already written functions that fetch them and save them in lists.
The problem is that my code loops through both lists in parallel and does not pair the correct projectID with the correct jobID, so it throws errors because the combination does not exist.
So what I need is a way to check which jobIDs belong to which projectID, so that I can make a GET request to get all the executionIDs and then the PDF of the report.
I am kind of new to programming, so I would love any help I can get. If anyone has a better solution, please feel free to let me know.
My script:
import requests
import json
import csv
from datetime import datetime
from jsonpath_ng import jsonpath, parse
API_key = 'api_key'
headers = {'Authorization':'{}'.format(API_key)}
list_projectId = []
list_jobId = []
list_executionId = []
ParseData_projectId = parse('$..id')
ParseData_jobId = parse('$..id')
ParseData_executionId = parse('$..id')
def parsing (response,ParseData,list_data):
# parses data and appends it to the list
Data = json.loads(response)
Parsaj = ParseData
Podatki = Parsaj.find(Data)
for i in range(0, len(Podatki)):
vrednost = Podatki[i].value
list_data.append(vrednost)
def projectId():
# gets all projectId's and saves them in list_projectId
url = 'https://api.testproject.io/v2/projects?_start=0'
response = requests.get(url,headers=headers)
response_json = response.json()
converted = json.dumps(response_json)
parsing(converted,ParseData_projectId,list_projectId)
def jobId():
# gets all jobId's and saves them in list_jobId
for i in range(0, len(list_projectId)):
id = list_projectId[i]
url = 'https://api.testproject.io/v2/projects/{}'.format(id) + '/jobs?onlyScheduled=false&_start=0'
response = requests.get(url,headers=headers)
response_json = response.json()
converted = json.dumps(response_json)
parsing(converted,ParseData_jobId,list_jobId)
def executionId():
# Their API link:
# https://api.testproject.io/v2/projects/{projectId}/jobs/{jobId}/reports?_start=0
# the for loop below does not work; here is where I need the help:
for i in range(0, len(list_projectId)):
project_id = list_projectId[i]
job_id = list_jobId[i]
url = 'https://api.testproject.io/v2/projects/{}'.format(project_id) + '/jobs/{}'.format(job_id) + '/reports?_start=0'
response = requests.get(url,headers=headers)
response_json = response.json()
converted = json.dumps(response_json)
parsing(converted,ParseData_executionId,list_executionId)
projectId()
print("----------LIST PROJECT ID: ----------")
print(list_projectId)
print("")
jobId()
print("----------LIST JOB ID: ----------")
print(list_jobId)
executionId()
print("----------LIST EXECUTION ID: ----------")
print(list_executionId)
You have to use the 'in' operator to check whether a value exists in a list.
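An 'in' check alone will not pair the right jobID with the right projectID, though, because the two flat lists lose that association. A minimal sketch of one way to keep it, plugging into the script above and reusing its parsing helper, headers and endpoints (the dict name jobs_by_project is just an illustration):
jobs_by_project = {}   # maps each projectId to the list of its jobIds

for project_id in list_projectId:
    url = 'https://api.testproject.io/v2/projects/{}/jobs?onlyScheduled=false&_start=0'.format(project_id)
    response = requests.get(url, headers=headers)
    job_ids = []
    parsing(json.dumps(response.json()), ParseData_jobId, job_ids)
    jobs_by_project[project_id] = job_ids

# every report request now uses a jobId that really belongs to that project
for project_id, job_ids in jobs_by_project.items():
    for job_id in job_ids:
        url = 'https://api.testproject.io/v2/projects/{}/jobs/{}/reports?_start=0'.format(project_id, job_id)
        response = requests.get(url, headers=headers)
        parsing(json.dumps(response.json()), ParseData_executionId, list_executionId)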
I am trying to run this Python script made by truffleHog to use their open-source scanner for scanning all repos of an account or organization at once.
Does anyone know how I can put my org name and access token into the URL without messing up the rest?
def get_org_repos(orgname, page):
response = requests.get(url='https://api.github.com/users/' + orgname + '/repos?page={}'.format(page))
json = response.json()
if not json:
return None
for item in json:
if item['fork'] == False:
print('searching ' + item["html_url"])
results = truffleHog.find_strings(item["html_url"], do_regex=True, custom_regexes=rules, do_entropy=False, max_depth=100000)
for issue in results["foundIssues"]:
d = loads(open(issue).read())
d['github_url'] = "{}/blob/{}/{}".format(item["html_url"], d['commitHash'], d['path'])
d['github_commit_url'] = "{}/commit/{}".format(item["html_url"], d['commitHash'])
d['diff'] = d['diff'][0:200]
d['printDiff'] = d['printDiff'][0:200]
print(dumps(d, indent=4))
get_org_repos(orgname, page + 1)
get_org_repos("insertOrgName", 1)
So far I have tried a few variations of things like
requests.get(url='https://api.github.com/users/myOrg/repos?access_token=xyz?page={}'.format(page))
Firstly, I would use f-strings to make the URL more readable; secondly, I would pass the GET parameters using requests' built-in functionality.
def get_org_repos(orgname, page):
params = {'page': page, 'access_token': access_token}
response = requests.get(f'https://api.github.com/users/{orgname}/repos', params=params)
json = response.json()
if not json:
return None
for item in json:
if item['fork'] == False:
print('searching ' + item["html_url"])
results = truffleHog.find_strings(item["html_url"], do_regex=True, custom_regexes=rules, do_entropy=False, max_depth=100000)
for issue in results["foundIssues"]:
d = loads(open(issue).read())
d['github_url'] = f'{item["html_url"]}/blob/{d["commitHash"]}/{d["path"]}'
d['github_commit_url'] = f'{item["html_url"]}/commit/{d["commitHash"]}'
d['diff'] = d['diff'][0:200]
d['printDiff'] = d['printDiff'][0:200]
print(dumps(d, indent=4))
get_org_repos(orgname, page + 1)
get_org_repos("insertOrgName", 1)
I'm using the code shown below to retrieve papers from arXiv. I want to retrieve papers that have the words "machine" and "learning" in the title. The number of papers is large, therefore I want to slice the results by year (published).
How can I request records of 2020 and 2019 in search_query? Please notice that I'm not interested in post-filtering.
import urllib.request
import time
import feedparser
# Base api query url
base_url = 'http://export.arxiv.org/api/query?';
# Search parameters
search_query = urllib.parse.quote("ti:machine learning")
start = 0
total_results = 5000
results_per_iteration = 1000
wait_time = 3
papers = []
print('Searching arXiv for %s' % search_query)
for i in range(start,total_results,results_per_iteration):
print("Results %i - %i" % (i,i+results_per_iteration))
query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
i,
results_per_iteration)
# perform a GET request using the base_url and query
response = urllib.request.urlopen(base_url+query).read()
# parse the response using feedparser
feed = feedparser.parse(response)
# Run through each entry, and print out information
for entry in feed.entries:
#print('arxiv-id: %s' % entry.id.split('/abs/')[-1])
#print('Title: %s' % entry.title)
#feedparser v4.1 only grabs the first author
#print('First Author: %s' % entry.author)
paper = {}
paper["date"] = entry.published
paper["title"] = entry.title
paper["first_author"] = entry.author
paper["summary"] = entry.summary
papers.append(paper)
# Sleep a bit before calling the API again
print('Bulk: %i' % 1)
time.sleep(wait_time)
According to the arXiv documentation, there is no published or date field available.
What you can do is to sort the results by date (by adding &sortBy=submittedDate&sortOrder=descending to your query parameters) and stop making requests when you reach 2018.
Basically your code should be modified like this:
import urllib.request
import time
import feedparser
# Base api query url
base_url = 'http://export.arxiv.org/api/query?';
# Search parameters
search_query = urllib.parse.quote("ti:machine learning")
i = 0
results_per_iteration = 1000
wait_time = 3
papers = []
year = ""
print('Searching arXiv for %s' % search_query)
while (year != "2018"): #stop requesting when papers date reach 2018
print("Results %i - %i" % (i,i+results_per_iteration))
query = 'search_query=%s&start=%i&max_results=%i&sortBy=submittedDate&sortOrder=descending' % (search_query,
i,
results_per_iteration)
# perform a GET request using the base_url and query
response = urllib.request.urlopen(base_url+query).read()
# parse the response using feedparser
feed = feedparser.parse(response)
# Run through each entry, and print out information
for entry in feed.entries:
#print('arxiv-id: %s' % entry.id.split('/abs/')[-1])
#print('Title: %s' % entry.title)
#feedparser v4.1 only grabs the first author
#print('First Author: %s' % entry.author)
paper = {}
paper["date"] = entry.published
year = paper["date"][0:4]
paper["title"] = entry.title
paper["first_author"] = entry.author
paper["summary"] = entry.summary
papers.append(paper)
# Sleep a bit before calling the API again
print('Bulk: %i' % 1)
i += results_per_iteration
time.sleep(wait_time)
for the "post-filtering" approach, once enough results are collected, I'd do something like this:
papers2019 = [item for item in papers if item["date"][0:4] == "2019"]
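Along the same lines, a single comprehension can keep both requested years at once (just a usage note on the papers list built above; the variable name is an example):
papers_2019_2020 = [item for item in papers if item["date"][0:4] in ("2019", "2020")]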
I wrote a Python script that calls the Zendesk API with GET and PUT methods; it successfully gets the data I wanted and applies some updates to the tickets.
The GET method below returns ticket number "6442", and the PUT method is intended to remove the tags.
from urllib.parse import urlencode
import json
import requests
# Set the credentials
credentials = 'some email', 'some password'
session = requests.Session()
session.auth = credentials
# Set the GET parameters
params_noreply_window = {
'query': 'type:ticket tags:test status<closed',
}
params_oustide_businesshour = {
'query': 'type:ticket tags:send_whatsapp_obh status:new',
}
url_search1 = 'https://propertypro.zendesk.com/api/v2/search.json?' + \
urlencode(params_noreply_window)
url_search2 = 'https://propertypro.zendesk.com/api/v2/search.json?' + \
urlencode(params_oustide_businesshour)
response_noreply_window = session.get(url_search1)
response_oustide_businesshour = session.get(url_search2)
# -----------------------------------------------------------------------------
if response_noreply_window.status_code != 200 or response_oustide_businesshour.status_code != 200:
print('Status 1:', response_noreply_window.status_code, 'Status 2:', response_oustide_businesshour.status_code,
'Problem with the request. Exiting.')
exit()
# Print the subject of each ticket in the results
data_noreply_window = response_noreply_window.json()
data_oustide_businesshour = response_oustide_businesshour.json()
# Ticket to update
# Create a list containing the values of the id field
# for each dictionary that is an element of the list data
id_merged1 = [result['id'] for result in data_noreply_window['results']]
print(type(id_merged1))
print(id_merged1)
id_merged2 = [result['id'] for result in data_oustide_businesshour['results']]
print(type(id_merged2))
print(id_merged2)
# Join value of list by using comma separated
id_merged1_joined = ','.join(map(str, id_merged1))
print(id_merged1_joined)
id_merged2_joined = ','.join(map(str, id_merged2))
print(id_merged2_joined)
# Package the data in a dictionary matching the expected JSON
data_comment1 = {"ticket":
{
"remove_tags": ["test"]
}
}
data_comment2 = {"ticket":
{
"remove_tags": ["send_whatsapp_obh"]
}
}
# Encode the data to create a JSON payload
payload1 = json.dumps(data_comment1)
payload2 = json.dumps(data_comment2)
print("**Start**")
# Set the request parameters
url_put_comments1 = 'https://propertypro.zendesk.com/api/v2/tickets/update_many.json?' +\
'ids=' + id_merged1_joined
url_put_comments2 = 'https://propertypro.zendesk.com/api/v2/tickets/update_many.json?' +\
'ids=' + id_merged2_joined
user = 'some email'
pwd = 'some password'
headers = {'content-type': 'application/json'}
# Do the HTTP put request
response_request_noreply = requests.put(url_put_comments1, data=payload1,
auth=(user, pwd), headers=headers)
response_request_obh = requests.put(url_put_comments2, data=payload2,
auth=(user, pwd), headers=headers)
# Check for HTTP codes other than 200
if response_request_noreply.status_code != 200 or response_request_obh.status_code != 200:
print('Status 1:', response_request_noreply.status_code,
'Status 2:', response_request_obh.status_code,
'Problem with the request. Exiting.')
exit()
# Report success
print('Successfully added comment to tickets')
However, after running my Python code and doing another GET request, the same ticket number still appears; I have to wait an unpredictable amount of time before the search returns 'null', which is what I expect since I have already updated the ticket with the PUT request.
Can anyone explain to me how the Zendesk API works here? My apologies for any unclear sentences in explaining my concern.
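Two Zendesk behaviours are likely in play here: update_many is processed as an asynchronous background job, and the Search API reads from an index that is refreshed with some delay, so a search issued right after the PUT can still return the old tickets. A hedged sketch of waiting for the bulk update job to finish before searching again, reusing the authenticated session from the script above (the polling interval is arbitrary):
import time

job = response_request_noreply.json()['job_status']    # update_many returns a job status object
while job['status'] not in ('completed', 'failed'):
    time.sleep(2)                                       # arbitrary polling interval
    job = session.get(job['url']).json()['job_status']  # poll /api/v2/job_statuses/{id}.json
print('bulk update finished with status:', job['status'])
# even after completion, the search index may still lag for a short while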
I have this code, which scrapes the Hacker News website with beautifulsoup4, and I am looking for a way to save the results into a DataFrame using pandas. I have already imported pandas in the code below, but I do not know how to save the results into a DataFrame. It only scrapes the most favored Hacker News post for now, but that can be changed.
import pandas as pd
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from math import ceil
import json, sys, argparse, validators
MAX_NUM_POSTS = 100
class HackerNewsScraper:
URL = 'https://news.ycombinator.com/news'
def __init__(self, posts):
self._total_posts = posts
self._total_pages = int(ceil(posts/30))
self._stories = []
def scrape_stories(self):
"""
Fetches all HTML data.
Each page is limited to 30 stories, this function will ensure enough pages are fetched.
"""
page = 1
while(page <= self._total_pages): # Makes sure to visit sufficient amount of pages
url = '{}?p={}'.format(self.URL, page)
html = get_html(url)
self.parse_stories(html)
page += 1
def parse_stories(self, html):
"""
Given a BeautifulSoup nested data structure `html`, parse_stories(html) will parse the data and select the desired fields.
After getting title, uri, author, comments, points, and rank, it will save them in dictionary form in self._stories.
"""
for storytext, subtext in zip(html.find_all('tr', {'class': 'athing'}),
html.find_all('td', {'class': 'subtext'})):
storylink = storytext.find_all('a',{'class':'storylink'})
sublink = subtext.select('a')
# All requested data being saved in the dictionary story below
TITLE = storylink[0].text.strip()
LINK = storylink[0]['href']
AUTHOR = sublink[0].text
COMMENTS = sublink[-1].text
POINTS = subtext.select('span')[0].text
RANK = storytext.select('span.rank')[0].text.strip('.')
story = {
'title' : TITLE,
'uri' : LINK,
'author' : AUTHOR,
'points' : POINTS,
'comments' : COMMENTS,
'rank' : RANK
}
# Make sure data satisfies requirements
story = validate_story(story)
# self._stories is an array of dictionaries that saves the requested number of stories
self._stories.append(story)
# If required number of stories met, stop parsing
if len(self._stories) >= self._total_posts:
return
def print_stories(self):
"""
Outputs the stories from list of dictionary format to JSON in STDOUT.
"""
json.dump(self._stories, sys.stdout, indent=4)
def get_stories(self):
"""
Returns the scraped stories to the user in a list of dictionary format.
Used for testing purposes.
"""
return self._stories
def get_html(url):
"""
Runs the HTML data through BeautifulSoup to get a BeautifulSoup object, a nested data structure.
"""
response = get_response(url)
if response is not None:
html = BeautifulSoup(response, 'html.parser')
return html
def validate_story(story):
"""
Ensures that all the story data is valid according to the task.
Will return valid data for each field.
"""
story['title'] = story['title'][:256]
if not valid_title(story['title']):
story['title'] = 'Valid title not found'
story['author'] = story['author'][:256]
if not valid_author(story['author']):
story['author'] = 'Valid author not found'
if not valid_uri(story['uri']):
story['uri'] = 'Valid URI not found'
story['comments'] = validate_number(story['comments'])
story['points'] = validate_number(story['points'])
story['rank'] = validate_number(story['rank'])
return story
def valid_title(title):
"""
Ensures that title is non empty string with <= 256 characters
"""
return (len(title) <= 256 and title)
def valid_author(author):
"""
Ensures that author is non empty string and <= 256 characters.
Solved the issue of not finding an author by checking the fetched data with HN username rules.
"""
if(author.find(' ') > -1): # Hacker News usernames don't support whitespace
return False
return (len(author) <= 256 and author)
def valid_uri(url):
"""
To be able to find the scraped stories, we need their URL.
If data is not a valid URL, return False.
"""
if(validators.url(url)):
return True
return False
def validate_number(numString):
"""
Will make sure that the returned number is an integer.
Will strip any non digits from the input and return the first number.
"""
if numString.find('ago') > -1: #If not found, 'time since posted' would replace points for example
return 0
digits = [int(s) for s in numString.split() if s.isdigit()]
if len(digits) > 0:
return digits[0]
return 0
def get_response(url):
"""
Attempts to get the content at 'url' by making an HTTP GET request.
If the content-type of response is some kind of HTML/XML, return the
text content, otherwise return None.
"""
try:
with closing(get(url, stream=True)) as resp:
if is_good_response(resp):
return resp.content
else:
return None
except RequestException as e:
log_error('Error during requests to {0} : {1}'.format(url, str(e)))
return None
def is_good_response(resp):
"""
Returns True if the response seems to be HTML, False otherwise.
"""
content_type = resp.headers['Content-Type'].lower()
return (resp.status_code == 200
and content_type is not None
and content_type.find('html') > -1)
def log_error(e):
"""
Log the errors. Currently just printing them out to user.
"""
print(e)
def validate_input(arg, arg_max):
"""
Validate the user input. Makes sure it is less than or equal to 100 posts.
"""
error_msg = 'Posts cannot exceed {}'.format(arg_max)
if arg > arg_max:
raise argparse.ArgumentTypeError(error_msg)
# Parses the number of posts input from user. Default is 10.
def parse_arguments():
"""
Parses the argument input from the user. Default is 10.
"""
parser = argparse.ArgumentParser()
parser.add_argument('--posts', '-p', metavar='n', type=int, default=1, help='number of posts (max 100)')
args = parser.parse_args()
validate_input(args.posts, MAX_NUM_POSTS)
return args.posts
def main():
"""
If user input is valid, will create a scraper and fetch requests number of posts and print them to the user.
"""
try:
posts = parse_arguments()
hnews_scraper = HackerNewsScraper(posts)
hnews_scraper.scrape_stories()
hnews_scraper.print_stories()
except argparse.ArgumentTypeError as ex:
log_error(ex)
if __name__ == '__main__':
main()
Try This:
Don't forget to import Pandas
# parse_stories() already builds a dictionary per story and appends it to self._stories:
story = {
'title' : TITLE,
'uri' : LINK,
'author' : AUTHOR,
'points' : POINTS,
'comments' : COMMENTS,
'rank' : RANK
}
# so a DataFrame can be built directly from that list of dictionaries, e.g. in main():
stories = hnews_scraper.get_stories()
dt = pd.DataFrame(stories, columns=['title', 'uri', 'author', 'points', 'comments', 'rank'])
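If needed, the DataFrame can then be written out, for example to CSV (the file name is just an example):
dt.to_csv('hackernews_stories.csv', index=False)   # persist the scraped stories
print(dt.head())                                   # quick sanity check of the first rows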