needs two lists: the list of pages to crawl and the list of pages already crawled.
The basic concept of crawling is this:
start with tocrawl = [seed]
crawled = []
while there are more pages tocrawl:
    pick a page from tocrawl
    add that page to crawled
    add all the link targets on this page to tocrawl
return crawled
Here is the procedure that gets all the links from a page:
def get_all_links(page):
    links = []
    while True:
        # get_next_target (defined earlier) returns the next link url
        # on the page and the position where the search left off
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            # keep searching from just past this link
            page = page[endpos:]
        else:
            break
    return links
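get_all_links depends on get_next_target, which was defined earlier in the course. As a reminder, here is a minimal sketch consistent with how it is used above, assuming links appear in the page text as <a href="...">:

def get_next_target(page):
    # find the start of the next link tag; -1 means there are no more links
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    # the url sits between the pair of quotes that follows
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote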
The crawler starts from the seed page. The important point is that it follows the most recently added link first, because Python's pop removes and returns the last element of a list.
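A quick illustration of pop's behavior, on a made-up tocrawl list:

tocrawl = ['A', 'B', 'C']
print(tocrawl.pop())   # -> 'C' (the last element is removed and returned)
print(tocrawl)         # -> ['A', 'B']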
The crawl_web procedure starts out like this:

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        page = tocrawl.pop()
To add the newly found links to tocrawl without repeating pages that are already there, we use a union procedure:

def union(p, q):
    # append to p every element of q that is not already in p
    for e in q:
        if e not in p:
            p.append(e)
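For example, with two made-up lists:

p = ['a', 'b']
union(p, ['b', 'c'])
print(p)   # -> ['a', 'b', 'c']; 'b' is not added a second time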
Putting it all together, the complete crawl_web procedure is:

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            # get_page (defined earlier) returns the contents of the page
            union(tocrawl, get_all_links(get_page(page)))
            crawled.append(page)
    return crawled
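To try crawl_web without a network, we can fake get_page with a dictionary that stands in for a tiny web. The urls and page contents below are made up for illustration, and this assumes the get_next_target sketch above:

# a tiny simulated web; a real get_page would fetch the url's contents
cache = {
    'http://a.example': '<a href="http://b.example"> <a href="http://c.example">',
    'http://b.example': '<a href="http://c.example">',
    'http://c.example': '',
}

def get_page(url):
    return cache.get(url, '')

print(crawl_web('http://a.example'))
# -> ['http://a.example', 'http://c.example', 'http://b.example']

Notice that http://c.example is crawled before http://b.example: pop takes the most recently added link first.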