Raspberry Pi Webbot

During the second week of summer, I created a web bot crawler for the Raspberry Pi because I wanted to learn more and become an expert at programming the Raspberry Pi. You may ask, what is a web bot crawler? A web bot crawler is a program that visits websites and builds an index of their data so they can be bookmarked for easy searching. Google and Bing use web bots to bookmark websites and download the appropriate information to index for people to search. I came across some errors when I was writing the program for the web bot crawler from the book. The first error I came across was with downloading images: the code was not parsing the URL of each link correctly, so I had to fix the code so that it parses the URL correctly.
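
To see what that URL parsing does, here is a rough sketch of the idea. The linkText value below is a made-up example link, not one taken from the real site; the actual program pulls it out of each <a> tag on the page. The idea is to record the position of every '/' in the link and slice out the first part of the path to use as a directory name:

linkText = "/images/cat1.jpg"  # made-up example link
slashList = [i for i, ind in enumerate(linkText) if ind == '/']
# slashList is [0, 7]: the position of every '/' in the link
directoryName = linkText[(slashList[0] + 1):slashList[1] + 1]
print(directoryName)  # prints "images/", the folder the downloaded file gets saved under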

Here is the Python program for the web bot crawler (it is written for Python 2):

import mechanize
import time
from bs4 import BeautifulSoup
import re
import urllib
import string
import os

def downloadProcess(html, base, filetype, linkList):
    # This does the actual file downloading.
    print(base)
    print(filetype)
    soup = BeautifulSoup(html)
    for link in soup.find_all('a'):
        try:
            linkText = str(link.get('href'))
            print(linkText)
            if filetype in linkText:
                # Find the position of every '/' in the link text.
                slashList = [i for i, ind in enumerate(linkText) if ind == '/']
                print(linkText[slashList[0] + 1])
                print(linkText[slashList[1]])
                # This was the line where the program was giving errors
                directoryName = linkText[(slashList[0] + 1):slashList[1] + 1]
                print(directoryName)
                if not os.path.exists(directoryName):
                    os.makedirs(directoryName)
                print('opening image')
                image = urllib.URLopener()
                print('opened image')
                linkGet = base + linkText
                filesave = string.lstrip(linkText, "/")
                print(linkGet)
                image.retrieve(linkGet, filesave)
            elif "htm" in linkText:
                # Covers both .htm and .html; save these pages to crawl next.
                linkList.append(link)
        except Exception as ex:
            print(ex)

#start = "http://" + raw_input("Where would you like to start searching?\n")
start = raw_input("Where would you like to start searching?\n")
filetype = raw_input("What file type are you looking for?\n")
#start = "http://www.irrelevantcheetah.com/browserimages.html"
#filetype = "jpg"

numSlash = start.count('/')
slashList = [i for i, ind in enumerate(start) if ind == '/']
if (len(slashList) >= 3):
    # Keep everything up to the third '/' as the base address of the site.
    third = slashList[2]
    base = start[:third]
else:
    base = start

br = mechanize.Browser()
print(start)
r = br.open(start)
html = r.read()
linkList = []
print "Parsing " + start
downloadProcess(html, base, filetype, linkList)

# Crawl the pages that were saved for later, one page deep.
for leftover in linkList:
    time.sleep(0.1)
    linkText = str(leftover.get('href'))
    print "Parsing " + base + linkText
    br = mechanize.Browser()
    r = br.open(base + linkText)
    html = r.read()
    linkList = []
    downloadProcess(html, base, filetype, linkList)
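
To run the bot you need Python 2 with the mechanize and Beautiful Soup 4 modules installed. The program asks where to start searching and what file type to look for; the commented-out lines above suggest, for example, starting at http://www.irrelevantcheetah.com/browserimages.html and looking for jpg files. It then downloads every matching file into a folder named after the first part of the link's path, and follows links that contain "htm" to search one page deeper.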

References:

Learn Raspberry Pi Programming with Python, http://www.amazon.com/Learn-Raspberry-Pi-Programming-Python/dp/1430264241/ref=sr_1_1?s=books&ie=UTF8&qid=1435595569&sr=1-1&keywords=learn+raspberry+pi+programming+with+python
