#Home
|
#Search
|
#IRC (WEB/Client)
|
#MD5 Cracker
|
#Categories
|
#Links
|
#About
Simple Spider (With BeautifulSoup Library)
#!/usr/bin/env python
"""Simple same-site spider built on urllib2 and BeautifulSoup (Python 2).

Usage: ./crawlzor.py link

Starting from the URL given as the first command-line argument, the script
fetches each page, extracts every <a href=...> target, and queues the ones
that use the ``http`` scheme and whose host equals the root host or is a
subdomain of it.  Every newly discovered URL is printed as it is found.

Module-level results:
    linkz   -- every URL queued, in discovery order (root first)
    crawled -- every URL that was attempted
    errorz  -- URLs whose download or parsing raised an error
"""
import sys
import urllib2
import urlparse

from BeautifulSoup import BeautifulSoup


def _host_without_default_port(parsed):
    """Return ``parsed.netloc`` with an explicit port 80 removed.

    ``parsed`` is a result of ``urlparse.urlparse``.  Reading ``parsed.port``
    may raise ValueError when the port text is not a number; the caller
    decides how to handle that.
    """
    if parsed.port == 80:
        # Split on the last colon instead of slicing three characters so that
        # spellings such as "host:080" are handled as well as "host:80".
        return parsed.netloc.rsplit(':', 1)[0]
    return parsed.netloc


try:
    root = sys.argv[1]
except IndexError:
    print(" Usage: ./crawlzor.py link")
    print(" Example: ./crawlzor.py http://yahoo.com/")
    sys.exit()

linkz = []
crawled = []
errorz = []

parsedRoot = urlparse.urlparse(root)
hostRoot = _host_without_default_port(parsedRoot)

linkz.append(root)
# Mirror of linkz for O(1) "already queued?" checks.
seen = set(linkz)

# linkz grows while it is being iterated, so it doubles as a FIFO work queue.
for i in linkz:
    if i in crawled:
        continue

    print("[*] Crawling " + i)
    try:
        response = urllib2.urlopen(i)
        try:
            src = response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()

        bs = BeautifulSoup(src)
        for j in bs.findAll('a', {'href': True}):
            absUrl = urlparse.urljoin(i, j['href'])
            parsedUrl = urlparse.urlparse(absUrl)
            try:
                hostUrl = _host_without_default_port(parsedUrl)
            except ValueError:
                # Malformed port in this one link: skip the link instead of
                # abandoning the rest of the page.
                continue

            # Rebuild the URL without the redundant ":80" so that equivalent
            # links are de-duplicated.
            absUrl = urlparse.urlunparse((parsedUrl.scheme,
                                          hostUrl,
                                          parsedUrl.path,
                                          parsedUrl.params,
                                          parsedUrl.query,
                                          parsedUrl.fragment))

            # Compare the port-normalized host: comparing the raw netloc
            # would reject same-site links that spell out ":80".
            sameSite = (hostUrl == hostRoot or
                        hostUrl.endswith('.' + hostRoot))

            # Only plain-http links are followed.
            if parsedUrl.scheme == 'http' and sameSite and absUrl not in seen:
                print(absUrl)
                seen.add(absUrl)
                linkz.append(absUrl)
    except Exception:
        # Best-effort crawl: record the failure and move on.  Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit stop the run.
        print("[-] Error @ " + i + ".. skipping")
        errorz.append(i)

    crawled.append(i)
Back
Send all submissions to nullbyte.israel[at]gmail.com
Copyright © 2009 - 2010 | Queries: 4