import
httplib2
import
os
import
re
import
threading
import
urllib
import
urllib.request
from
urllib.parse
import
urlparse, urljoin
from
bs4
import
BeautifulSoup
class CrawlerSingleton:
    """Process-wide holder for the crawler's shared state.

    Every call to ``CrawlerSingleton()`` returns the same object, so all
    code in this module reads and writes one set of attributes:

    * ``url_queue`` -- list of page URLs still waiting to be fetched.
    * ``visited_url`` -- set of page URLs that were fetched successfully.
    * ``image_downloaded`` -- set of image URLs already handed to the
      downloader.
    """

    def __new__(cls):
        # The first call creates the instance and caches it on the class;
        # every later call hands back that cached object unchanged.
        if not hasattr(cls, 'instance'):
            cls.instance = super().__new__(cls)
            # Start with empty containers so the attributes exist even
            # before a caller assigns its own values to them.
            cls.instance.url_queue = []
            cls.instance.visited_url = set()
            cls.instance.image_downloaded = set()
        return cls.instance
def _same_site_url(page_url, href, site):
    """Resolve *href* found on *page_url*; keep it only if it stays on *site*.

    Args:
        page_url: Absolute URL of the page the link was found on.
        href: Raw ``href`` attribute value (absolute or relative).
        site: Parsed URL of the site being crawled; its ``scheme`` and
            ``netloc`` are used.

    Returns:
        ``scheme://netloc/path`` built from the site's scheme and the
        resolved link (query string and fragment are dropped), or ``None``
        when the link points to another host or has no host at all
        (for example ``mailto:`` or ``javascript:`` links).
    """
    # urljoin handles relative paths ("about.html", "../x") correctly,
    # which plain string concatenation of netloc + path does not.
    target = urlparse(urljoin(page_url, href))
    if target.netloc != site.netloc:
        return None
    return site.scheme + '://' + target.netloc + target.path


def navigate_site(max_links=5):
    """Crawl same-site pages starting from the queued URLs.

    Pops URLs from ``CrawlerSingleton().url_queue``, fetches each one,
    records it in ``visited_url`` and queues the same-site links found on
    the page. New links are inserted at the front while URLs are popped
    from the back, so pages are processed in first-in, first-out order.

    Relies on the module-level ``parsed_url`` (the parsed start URL) to
    decide which links belong to the site.

    Args:
        max_links: Stop once at least this many pages have been visited.
    """
    crawler = CrawlerSingleton()
    # One client is enough for the whole crawl; it is only used here.
    http = httplib2.Http()
    while crawler.url_queue:
        # ">=" so the loop also stops if the set already holds more
        # entries than the limit when the crawl starts.
        if len(crawler.visited_url) >= max_links:
            return
        url = crawler.url_queue.pop()
        try:
            # httplib2 returns a (response headers, body) pair.
            _headers, content = http.request(url)
        except Exception as exc:
            # Best effort: report the failure and move on to the next URL.
            print('Skipping', url, '-', exc)
            continue
        crawler.visited_url.add(url)
        print(url)
        soup = BeautifulSoup(content, "html.parser")
        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            if not href:
                continue
            link_url = _same_site_url(url, href, parsed_url)
            if link_url is None:
                continue
            # Skip pages already fetched and pages already waiting, so no
            # URL is queued (and later fetched) twice.
            if link_url in crawler.visited_url or link_url in crawler.url_queue:
                continue
            crawler.url_queue.insert(0, link_url)
class ParallelDownloader(threading.Thread):
    """Thread that runs ``download_images`` under the given thread name."""

    def __init__(self, thread_id, name, counter):
        # thread_id and counter are accepted but neither stored nor used.
        super().__init__()
        self.name = name

    def run(self):
        """Announce the start, download images, then announce the end."""
        label = self.name
        print('Starting thread', label)
        download_images(label)
        print('Finished thread', label)
def download_images(thread_name):
    """Download the images referenced by every page in ``visited_url``.

    Repeatedly removes a page URL from ``CrawlerSingleton().visited_url``,
    fetches the page, and saves each ``<img>`` it references into the
    ``images`` directory (which must already exist). Image URLs are
    recorded in ``image_downloaded`` so each is attempted only once.

    Args:
        thread_name: Label printed in the progress messages.
    """
    singleton = CrawlerSingleton()
    # Created per call, so each worker thread has its own client.
    http = httplib2.Http()
    while True:
        try:
            url = singleton.visited_url.pop()
        except KeyError:
            # The set is empty. Popping first and handling the error
            # avoids the gap between "is it non-empty?" and "pop" in
            # which another thread may take the last entry.
            break
        print(thread_name, 'Downloading images from', url)
        try:
            # httplib2 returns a (response headers, body) pair.
            _headers, content = http.request(url)
        except Exception as exc:
            # Best effort: report the failure and go on to the next page.
            print(thread_name, 'could not fetch', url, '-', exc)
            continue
        soup = BeautifulSoup(content, "html.parser")
        for image in soup.find_all('img'):
            src = image.get('src')
            if not src:
                # urljoin(url, None) returns the page URL itself, which
                # would then be saved as if it were an image.
                continue
            src = urljoin(url, src)
            # Take the file name from the path only, so a query string
            # ("logo.png?v=2") does not end up in the saved file name.
            basename = os.path.basename(urlparse(src).path)
            print('basename:', basename)
            if not basename or src in singleton.image_downloaded:
                continue
            singleton.image_downloaded.add(src)
            print('Downloading', src)
            try:
                urllib.request.urlretrieve(src, os.path.join('images', basename))
            except (OSError, ValueError) as exc:
                # URLError/HTTPError are OSError subclasses; ValueError
                # covers URLs urllib cannot open. One bad image should
                # not end the whole worker.
                print('Failed to download', src, '-', exc)
        print(thread_name, 'finished downloading images from', url)
def main():
    """Crawl the site at ``main_url`` and download its images on two threads.

    Resets the shared ``CrawlerSingleton`` state, runs ``navigate_site()``
    to collect page URLs, makes sure the ``images`` directory exists, and
    starts two ``ParallelDownloader`` threads. The threads are started but
    not joined here.
    """
    crawler = CrawlerSingleton()
    crawler.url_queue = [main_url]
    crawler.visited_url = set()
    crawler.image_downloaded = set()
    navigate_site()
    # exist_ok avoids the check-then-create gap of exists() + makedirs().
    os.makedirs('images', exist_ok=True)
    workers = [
        ParallelDownloader(1, "Thread-1", 1),
        ParallelDownloader(2, "Thread-2", 2),
    ]
    for worker in workers:
        worker.start()
if __name__ == "__main__":
    # navigate_site() reads the module-level parsed_url to decide which
    # links belong to the site, so it has to be set before main() runs.
    # NOTE(review): main_url is not assigned anywhere in the visible
    # source -- confirm it is defined before this point.
    parsed_url = urlparse(main_url)
    main()