tutorialpoint.org

Blogs

Algorithm for Friendship, Affair, and Love

IIT Patna: A Journey Began

Google page ranking algorithm

Crawling a website

Performance measure of a website

Code of a crawler to fetch links

# to access the source code of any website
import urllib
#for parsing of HTML code
from bs4 import BeautifulSoup
import urlparse
#virtual browser to support all the activity
import mechanize
# Breadth-first crawler: starting from a seed URL, fetch each queued page,
# collect the links on it, and queue every not-yet-seen link whose absolute
# URL contains the seed URL. Prints each newly discovered URL as it is found
# and the full list of discovered URLs at the end.

# Set the starting point for the spider and initialize
# a mechanize browser object.
url = "http://iitk.ac.in"
br = mechanize.Browser()
# Queue of URLs still waiting to be fetched, and every URL discovered so far
# (kept in discovery order). Both start out holding only the seed URL.
urls = [url]
visited = [url]
# The queue grows while the crawl runs, so keep going until it is empty.
while urls:
    # Remove the next URL from the queue exactly once, before anything can
    # fail. Popping in both the try body and the except handler (as this
    # script used to) drops a second, never-fetched URL whenever an error
    # occurs after a successful open, and raises IndexError when the queue
    # is already empty at that point.
    current = urls.pop(0)
    try:
        br.open(current)
        for link in br.links():
            # Combine the link's base URL and its (possibly relative) target
            # into one absolute URL.
            newurl = urlparse.urljoin(link.base_url, link.url)
            # Queue only unseen URLs that contain the seed URL as a substring.
            # NOTE(review): this is a substring test, not a host comparison,
            # so an off-site URL that merely embeds the seed string passes.
            if newurl not in visited and url in newurl:
                visited.append(newurl)
                urls.append(newurl)
                print(newurl)
    except Exception:
        # Best-effort crawl: report the failure and move on to the next URL.
        # Catching Exception (not a bare except) lets Ctrl-C stop the crawl.
        print("error")
print(visited)

< Prev.Page   1   2   3   4   5   6   Next page>