So for one of my projects I have to create an AI similar to the PageRank algorithm to rank the importance of HTML files. The code and the error are below. I am out of ideas on this one; I have been looking at this code for the past 3 hours and googling for a way to fix it. I know that when you call a dict, to make sure it works, you have to pass the key in as an array, and in the code I am passing the page as [page]. I need another pair of eyes to look at it.
Traceback (most recent call last):
  File "pagerank.py", line 208, in <module>
    main()
  File "pagerank.py", line 14, in main
    ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
  File "pagerank.py", line 119, in sample_pagerank
    new_samp_choice = transition_model(corpus, sample, damping_factor)
  File "pagerank.py", line 64, in transition_model
    num_links = len(corpus([page]))
TypeError: 'dict' object is not callable
So I looked on Stack Overflow, which said that the way to call a dict is to pass the key in as an array. But I have looked at the code a hundred times and I don't know what's wrong with it.
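For reference, here is a minimal sketch (with a made-up two-page corpus) of how I understand dict access to work, since the traceback points at the corpus lookup line:

# Made-up corpus, just to illustrate dict access
corpus = {"1.html": {"2.html"}, "2.html": set()}
page = "1.html"

links = corpus[page]       # square brackets index the dict -> {'2.html'}
print(len(links))          # 1

# corpus([page])           # parentheses would *call* the dict and raise
                           # TypeError: 'dict' object is not callable

The full script follows.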
import os
import random
import re
import sys
from collections import Counter
DAMPING = 0.85
SAMPLES = 10000
def main():
    if len(sys.argv) != 2:
        sys.exit("Usage: python pagerank.py corpus")
    corpus = crawl(sys.argv[1])
    ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
    print(f"PageRank Results from Sampling (n = {SAMPLES})")
    for page in sorted(ranks):
        print(f" {page}: {ranks[page]:.4f}")
    ranks = iterate_pagerank(corpus, DAMPING)
    print(f"PageRank Results from Iteration")
    for page in sorted(ranks):
        print(f" {page}: {ranks[page]:.4f}")
def crawl(directory):
    """
    Parse a directory of HTML pages and check for links to other pages.
    Return a dictionary where each key is a page, and values are
    a list of all other pages in the corpus that are linked to by the page.
    """
    pages = dict()

    # Extract all links from HTML files
    for filename in os.listdir(directory):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(directory, filename)) as f:
            contents = f.read()
            links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
            pages[filename] = set(links) - {filename}

    # Only include links to other pages in the corpus
    for filename in pages:
        pages[filename] = set(
            link for link in pages[filename]
            if link in pages
        )

    return pages
def transition_model(corpus, page, damping_factor):
    """
    Return a probability distribution over which page to visit next,
    given a current page.
    With probability `damping_factor`, choose a link at random
    linked to by `page`. With probability `1 - damping_factor`, choose
    a link at random chosen from all pages in the corpus.
    """
    page_mod = {}
    # Number of files in the corpus
    num_files = len(corpus)
    # Number of links from the page that was picked at random
    num_links = len(corpus([page]))
    if num_links != 0:
        # Calculate the probability
        randonm_set = (1 - damping_factor) / num_files
        # Calculate the specific page-related probability
        specific_set = (1 - damping_factor) / num_links
    else:
        # Calculate the probability from all pages
        randonm_set = (1 - damping_factor) / num_links
        specific_set = 0
    # Iterate over the files
    for file in corpus:
        # Check the page to see if there are any other links
        if len(corpus[page]) == 0:
            page_mod[file] = 1 / num_files
        else:
            if file not in corpus[page]:
                page_mod[file] = randonm_set
            else:
                page_mod[file] = specific_set + randonm_set
    if round(sum(page_mod.values()), 5) != 1:
        print(f'ERROR! The probabilities add up to {sum(page_mod.values())}')
    return page_mod
def sample_pagerank(corpus, damping_factor, n):
    """
    Return PageRank values for each page by sampling `n` pages
    according to transition model, starting with a page at random.
    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    sample_PR = {}
    # Map each page in the corpus to a sample count of 0
    for page in corpus:
        sample_PR[page] = 0
    sample = None
    for iteration in range(n):
        if sample == None:
            # List of all the choices
            choices = list(corpus.keys())
            # Choose a page at random
            sample = random.choice(choices)
            sample_PR[sample] += 1
        else:
            # Get the probability distribution based on the current sample choice
            new_samp_choice = transition_model(corpus, sample, damping_factor)
            # List of all choices
            choices = list(new_samp_choice.keys())
            # Weights of the distribution for each page, ranking up their importance
            weights = [new_samp_choice[key] for key in choices]
            # random.choices returns a list of values, so pop the single element
            sample = random.choices(choices, weights).pop()
            sample_PR[sample] += 1
    # Divide by the number of iterations to get a percentage
    sample_PR + {key: value/n for key, value in sample_PR.items()}
    # Check if the values add up to 1
    if round(sum(sample_PR.values()), 5) != 1:
        print(f'ERROR! The probabilities add up to {sum(page_mod.values())}')
    else:
        print(
            f'sum of the page Rank files: {round(sum(sample_PR.values()),10)}')
    return sample_PR
def iterate_pagerank(corpus, damping_factor):
    """
    Return PageRank values for each page by iteratively updating
    PageRank values until convergence.
    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    # Create a dictionary for the iterations
    iterate_PR = {}
    # The number of pages in the corpus
    num_pages = len(corpus)
    # Iterate over the corpus and assign a starting value to each page
    for page in corpus:
        iterate_PR[page] = 1 / num_pages
    changes = 1
    iterations = 1
    while changes >= 0.001:
        changes = 0
        # Copy the current state of the values to make sure one update doesn't override another
        prev_state = iterate_PR.copy()
        # Iterate over the pages
        for page in iterate_PR:
            # Get the parent pages that link to it
            parents = [link for link in corpus if page in corpus[link]]
            # Damping factor over the number of pages, plus the sum over the parents built as a list
            first_eq = (1 - damping_factor) / num_pages
            second_eq = []
            if len(parents) != 0:
                for parent in parents:
                    # Number of links from the parent page
                    num_links = len(corpus[parent])
                    value = prev_state[parent] / num_links
                    second_eq.append(value)
            # Sum up the values from the parent pages
            second = sum(second_eq)
            iterate_PR[page] = first_eq + (damping_factor * second)
            # Calculate the change for this iteration
            new_change = abs(iterate_PR[page] - prev_state[page])
            if changes < new_change:
                changes = new_change
        iterations += 1
    dictsum = sum(iterate_PR.values())
    iterate_PR = {key: value/dictsum for key, value in iterate_PR.items()}
    print(f'\nPageRank value stable after {iterations} iterations.')
    print(f'  Sum of iterate_pagerank values: {round(sum(iterate_PR.values()),10)}')
    return iterate_PR


if __name__ == "__main__":
    main()
Related
Does anyone have a recommendation for how I can perform a string search in GitHub code, identify the files matching the string, and then identify the owner of each matching code file?
This function allows me to search code, but I can't figure out how to identify the author/owner.
def search_github(keyword):
    #rate_limit = g.get_rate_limit()
    #rate = rate_limit.search
    #if rate.remaining == 0:
    #    print(f'You have 0/{rate.limit} API calls remaining. Reset time: {rate.reset}')
    #    return
    #else:
    #    print(f'You have {rate.remaining}/{rate.limit} API calls remaining')

    query = f'"{keyword}"'
    result = g.search_code(query, order='desc')

    max_size = 100
    print(f'Found {result.totalCount} file(s)')
    if result.totalCount > max_size:
        result = result[:max_size]

    for file in result:
        print(f'{file.download_url}')
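If it helps, PyGithub's code-search results are ContentFile objects that carry a reference to their repository, so the repository owner can usually be read from there. A minimal sketch, assuming g is an authenticated Github instance as in the snippet above and the token is a placeholder:

from github import Github

g = Github("your_access_token")  # placeholder token

for file in g.search_code('"some keyword"', order='desc'):
    repo = file.repository                # repository the matching file belongs to
    print(file.path, repo.full_name, repo.owner.login)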
The idea of the code is to add unwatched EPs to an existing playlist by index order (ep 1 of Show X, ep 1 of Show Z, and so on), regardless of air date:
from plexapi.server import PlexServer
baseurl = 'http://0.0.0.0:0000/'
token = '0000000000000'
plex = PlexServer(baseurl, token)
episode = 0
first_ep_name = []
for x in plex.library.section('Anime').search(unwatched=True):
    try:
        for y in plex.library.section('Anime').get(x.title).episodes()[episode]:
            if plex.library.section('Anime').get(x.title).episodes()[episode].isWatched:
                episode += 1
                first_ep_name.append(y)
            else:
                episode = 0
                first_ep_name.append(y)
    except:
        continue

plex.playlist('Anime Playlist').addItems(first_ep_name)
But when I run it, it always adds watched EPs, yet when I debug the code in Thonny IDE it seems to be doing its job, so I am not sure what's wrong with the code.
Any ideas?
I'm thinking that the error might be here:
plex.playlist('Anime Playlist').addItems(first_ep_name)
but according to the documentation addItems should take a list, and my list "first_ep_name" is already appending unwatched episodes in the correct order. In theory addItems should recognize the specific episode and not only the series name, but I am not sure anymore.
In case somebody out there is having the same issue with plexapi: I was able to find a way to get this project working properly.
from plexapi.server import PlexServer
baseurl = 'insert plex url here'
token = 'plex token here'
plex = PlexServer(baseurl, token)
anime_plex = []
scrapped_playlist = []
for x in plex.library.section('Anime').search(unwatched=True):
    anime_plex.append(x)

while len(anime_plex) > 0:
    episode_list = []
    for y in plex.library.section('Anime').get(anime_plex[0].title).episodes():
        episode_list.append(y)

    ep_checker = True
    while ep_checker:
        if episode_list[0].isWatched:
            episode_list.pop(0)
        else:
            scrapped_playlist.append(episode_list[0])
            episode_list.clear()
            ep_checker = False
    anime_plex.pop(0)

# plex.playlist('Anime Playlist').addItems(scrapped_playlist)
plex.playlist('Anime Playlist').delete()
plex.createPlaylist('Anime Playlist', section='Anime', items=scrapped_playlist)
Basically, what I am doing with that code is looping through each anime series I have; if EP #X is watched it gets popped from the list until a False is found, and that unwatched episode is appended to an empty list that I later use for creating/adding to the playlist.
The last lines of the code can be commented out depending on the purpose, creating the Anime playlist or adding items to it.
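If it is useful, a shorter variant of the same idea is to take the first unwatched episode of each show directly with a generator expression. This is only a sketch, assuming the same 'Anime' section and 'Anime Playlist' names as above:

first_unwatched = []
for show in plex.library.section('Anime').search(unwatched=True):
    # first episode, in index order, that has not been watched yet
    ep = next((e for e in show.episodes() if not e.isWatched), None)
    if ep is not None:
        first_unwatched.append(ep)

plex.playlist('Anime Playlist').addItems(first_unwatched)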
I am getting a different result when I use Bio Entrez to search. For example, when I search with the query "covid side effect" in the browser I get 344 results, whereas I get only 92 when I use Bio Entrez. This is the code I was using.
from Bio import Entrez
Entrez.email = "Your.Name.Here@example.org"
handle = Entrez.esearch(db="pubmed", retmax=40, term="covid side effect", idtype="acc")
record = Entrez.read(handle)
handle.close()
print(record['Count'])
I was hoping someone could help me with this discrepancy.
For some reason everyone seems to have the same issue, whether it's the R API or the Python API. I have found a workaround that gets the same result. It is slow, but it gets the job done. If your result set is less than 10k you could probably use Selenium to get the PubMed IDs; otherwise, you can scrape the data using the code below. I hope this will help someone in the future.
import requests
# # Custom Date Range
# req = requests.get("https://pubmed.ncbi.nlm.nih.gov/?term=covid&filter=dates.2009/01/01-2020/03/01&format=pmid&sort=pubdate&size=200&page={}".format(i))
# # Custom Year Range
# req = requests.get("https://pubmed.ncbi.nlm.nih.gov/?term=covid&filter=years.2010-2019&format=pmid&sort=pubdate&size=200&page={}".format(i))
# #Relative Date
# req = requests.get("https://pubmed.ncbi.nlm.nih.gov/?term=covid&filter=datesearch.y_1&format=pmid&sort=pubdate&size=200&page={}".format(i))
# # filter language
# # &filter=lang.english
# # filter human
# #&filter=hum_ani.humans
# Systematic Review
#&filter=pubt.systematicreview
# Case Reports
# &filter=pubt.casereports
# Age
# &filter=age.newborn
search = "covid lungs"
# search_list = "+".join(search.split(' '))
def id_retriever(search_string):
    string = "+".join(search_string.split(' '))
    result = []
    old_result = len(result)
    for page in range(1, 10000000):
        req = requests.get("https://pubmed.ncbi.nlm.nih.gov/?term={string}&format=pmid&sort=pubdate&size=200&page={page}".format(page=page, string=string))
        for j in req.iter_lines():
            decoded = j.decode("utf-8").strip(" ")
            length = len(decoded)
            if "log_displayeduids" in decoded and length > 46:
                data = (str(j).split('"')[-2].split(","))
                result = result + data
                data = []
        new_result = len(result)
        if new_result != old_result:
            old_result = new_result
        else:
            break
    return result

ids = id_retriever(search)
len(ids)
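Once the IDs are collected this way, they can still be fed back into Biopython to fetch the record details. A minimal sketch, assuming the ids list from above and your own contact email:

from Bio import Entrez

Entrez.email = "Your.Name.Here@example.org"

# efetch accepts a comma-separated string of PubMed IDs
handle = Entrez.efetch(db="pubmed", id=",".join(ids[:20]), rettype="medline", retmode="text")
print(handle.read())
handle.close()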
Working on getting some wave heights from websites, and my code fails when the wave heights get into the double-digit range.
Ex: currently the code scrapes a 12 from the site as '1' and '2' separately, not '12'.
#Author: David Owens
#File name: soupScraper.py
#Description: html scraper that takes surf reports from various websites
import csv
import requests
from bs4 import BeautifulSoup
NUM_SITES = 2
reportsFinal = []
###################### SURFLINE URL STRINGS AND TAG ###########################
slRootUrl = 'http://www.surfline.com/surf-report/'
slSunsetCliffs = 'sunset-cliffs-southern-california_4254/'
slScrippsUrl = 'scripps-southern-california_4246/'
slBlacksUrl = 'blacks-southern-california_4245/'
slCardiffUrl = 'cardiff-southern-california_4786/'
slTagText = 'observed-wave-range'
slTag = 'id'
#list of surfline URL endings
slUrls = [slSunsetCliffs, slScrippsUrl, slBlacksUrl]
###############################################################################
#################### MAGICSEAWEED URL STRINGS AND TAG #########################
msRootUrl = 'http://magicseaweed.com/'
msSunsetCliffs = 'Sunset-Cliffs-Surf-Report/4211/'
msScrippsUrl = 'Scripps-Pier-La-Jolla-Surf-Report/296/'
msBlacksUrl = 'Torrey-Pines-Blacks-Beach-Surf-Report/295/'
msTagText = 'rating-text'
msTag = 'li'
#list of magicseaweed URL endings
msUrls = [msSunsetCliffs, msScrippsUrl, msBlacksUrl]
###############################################################################
'''
This class represents a surf break. It contains all wave, wind, & tide data
associated with that break relevant to the website
'''
class surfBreak:
    def __init__(self, name, low, high, wind, tide):
        self.name = name
        self.low = low
        self.high = high
        self.wind = wind
        self.tide = tide

    #toString method
    def __str__(self):
        return '{0}: Wave height: {1}-{2} Wind: {3} Tide: {4}'.format(self.name,
            self.low, self.high, self.wind, self.tide)
#END CLASS

'''
This returns the proper attribute from the surf report sites
'''
def reportTagFilter(tag):
    return (tag.has_attr('class') and 'rating-text' in tag['class']) \
        or (tag.has_attr('id') and tag['id'] == 'observed-wave-range')
#END METHOD

'''
This method checks if the parameter is of type int
'''
def representsInt(s):
    try:
        int(s)
        return True
    except ValueError:
        return False
#END METHOD

'''
This method extracts all ints from a list of reports
reports: The list of surf reports from a single website
returns: reportNums - A list of ints of the wave heights
'''
def extractInts(reports):
    print reports
    reportNums = []
    afterDash = False
    num = 0
    tens = 0
    ones = 0

    #extract all ints from the reports and ditch the rest
    for report in reports:
        for char in report:
            if representsInt(char) == True:
                num = int(char)
                reportNums.append(num)
            else:
                afterDash = True

    return reportNums
#END METHOD

'''
This method iterates through a list of urls and extracts the surf report from
the webpage dependent upon its tag location
rootUrl: The root url of each surf website
urlList: A list of specific urls to be appended to the root url for each
         break
tag: the html tag where the actual report lives on the page
returns: a list of strings of each breaks surf report
'''
def extractReports(rootUrl, urlList, tag, tagText):
    #empty list to hold reports
    reports = []
    reportNums = []
    index = 0

    #loop thru URLs
    for url in urlList:
        try:
            index += 1
            #request page
            request = requests.get(rootUrl + url)
            #turn into soup
            soup = BeautifulSoup(request.content, 'lxml')
            #get the tag where surflines report lives
            reportTag = soup.findAll(reportTagFilter)[0]
            reports.append(reportTag.text.strip())
        #notify if fail
        except:
            print 'scrape failure at URL ', index
            pass

    reportNums = extractInts(reports)
    return reportNums
#END METHOD

'''
This method calculates the average of the wave heights
'''
def calcAverages(reportList):
    #empty list to hold averages
    finalAverages = []
    listIndex = 0
    waveIndex = 0

    #loop thru list of reports to calc each breaks ave low and high
    for x in range(0, 6):
        #get low ave
        average = (reportList[listIndex][waveIndex]
            + reportList[listIndex+1][waveIndex]) / NUM_SITES
        finalAverages.append(average)
        waveIndex += 1

    return finalAverages
#END METHOD

slReports = extractReports(slRootUrl, slUrls, slTag, slTagText)
msReports = extractReports(msRootUrl, msUrls, msTag, msTagText)

reportsFinal.append(slReports)
reportsFinal.append(msReports)

print 'Surfline: ', slReports
print 'Magicseaweed: ', msReports
You are not actually extracting integers but floats, it seems, since the values in reports are something like ['0.3-0.6 m']. Right now you are going through every single character and converting it to an int one by one, or discarding it, so no wonder you only get single-digit numbers.
One (arguably) simple way to extract those numbers from that string is with a regexp:
import re

FLOATEXPR = re.compile(r"(\d+\.\d)-(\d+\.\d) {0,1}m")

def extractFloats(reports):
    reportNums = []
    for report in reports:
        groups = re.match(FLOATEXPR, report).groups()
        for group in groups:
            reportNums.append(float(group))
    return reportNums
This expression matches your floats and returns them as a list.
In detail, the expression matches anything that has at least one digit before a '.' and one digit after it, a '-' in between, another such float sequence, and an ending of 'm' or ' m'. It then groups the parts representing the floats into a tuple. For example, ['12.0-3.0m'] would return [12.0, 3.0]. If you expect more digits after the decimal point, you can add an extra '+' after the second '\d' in each group.
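For instance, a quick check of the helper against report strings in that format (the sample values are made up):

reports = ['0.3-0.6 m', '1.0-1.5m']
print(extractFloats(reports))   # [0.3, 0.6, 1.0, 1.5]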
I am trying to use ExtJS with Django; I started with ExtJS and PHP. To create a paginated grid I used to get the total count of the data and then the start and limit values. In Django, the pagination does not work. What did I forget? Is it my query? I use PostgreSQL. This is my code:
if request.POST['task'] == 'OK':
    pers = Plante.objects.all().values('id','name','year')
    nbrows = len(pers)
    if request.POST['start']:
        start = request.POST['start']
    else:
        start = request.GET['start']
    if request.POST['limit']:
        end = request.POST['limit']
    else:
        end = request.GET['limit']
    pers = Plante.objects.all().values('id','name','year')[start:end]
# Cast the ExtJS "start"/"limit" params to int; "limit" is a page size, so slice start:start+limit
start = int(request.POST.get('start') or request.GET.get('start'))
limit = int(request.POST.get('limit') or request.GET.get('limit'))
pers = Plante.objects.all().values('id','name','year')[start:start+limit]
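If it helps, the grid also needs the total row count in the response. Below is a minimal sketch of wiring the corrected slice into a JSON reply; the 'total' and 'rows' field names are assumptions that have to match your ExtJS store/reader config:

from django.http import JsonResponse

def plante_grid(request):
    start = int(request.POST.get('start') or request.GET.get('start') or 0)
    limit = int(request.POST.get('limit') or request.GET.get('limit') or 25)
    qs = Plante.objects.all().values('id', 'name', 'year')
    return JsonResponse({
        'total': qs.count(),                    # total count for the pager
        'rows': list(qs[start:start + limit]),  # only the requested page
    })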
I know it's quite late, but here's a way you can achieve it using the "start" & "limit" pagination params sent by ExtJS.
from datetime import datetime

from django.core.paginator import Paginator
from django.db.models import Q

def fetchRecords(self, params):
    totalCount = 0
    pageNumber = 1
    records = []
    ids = []

    # Instantiate your query object
    query = Q()

    # Not really relevant for this case, but if you have any filter criteria params then put them here
    if params.get("searchStartDate"):
        startDate = datetime.strptime(params.get("searchStartDate"), '%Y-%m-%d').date()
        query &= Q(date_created__gte=startDate)
    if params.get("searchEndDate"):
        endDate = datetime.strptime(params.get("searchEndDate"), '%Y-%m-%d').date()
        query &= Q(date_created__lte=endDate)

    # Get the total count; ExtJS grids need the total count value to be able to paginate
    totalCount = YourModel.objects.filter(query).count()

    # Get only the primary keys, because we don't want to pull all the objects into memory. The paginator doesn't
    # optimize the fetched data; if your table has millions of records and you load all the record objects into
    # memory, the execution might be quite slow
    your_model_ids_list = YourModel.objects.filter(query).order_by("-id").only('id')

    # Compute the page number based on the pagination "start" & "limit" params sent by the ExtJS grid
    if int(params.get("start")) != 0:
        pageNumber = (int(params.get("start")) / int(params.get("limit"))) + 1

    # Instantiate the paginator object with the unique ids list matching your filter criteria & limit
    paginator = Paginator(your_model_ids_list, int(params.get("limit")))

    # Get the records that fall on the particular page number that we computed above
    recordIds = paginator.page(pageNumber)

    # Iterate through the record IDs and place them in a list
    for recordId in recordIds.object_list:
        ids.append(recordId.id)

    # Now fetch the records from your model based on the unique ids that fall on the page fetched above
    result = YourModel.objects.filter(Q(pk__in=ids)).order_by("-id")

    # Formulate your response object and return the data
    return {'totalCount': totalCount, 'records': result}
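A quick illustration of how this might be called from a view; the params dict mirrors what an ExtJS grid sends, and the view name and response field names here are hypothetical:

from django.http import JsonResponse

def your_grid_view(request):
    # ExtJS sends "start"/"limit" as query params
    params = {
        "start": request.GET.get("start", "0"),
        "limit": request.GET.get("limit", "25"),
    }
    data = fetchRecords(None, params)  # 'self' is unused in the sketch above
    return JsonResponse({
        "total": data["totalCount"],
        "rows": [{"id": obj.pk} for obj in data["records"]],
    })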