from bs4 import BeautifulSoup
import urllib2
from datetime import datetime, timedelta, date
import time
import os
import random


# taken from http://stackoverflow.com/a/26747854/5241853
def readlines_reverse(filename):
    with open(filename) as qfile:
        qfile.seek(0, os.SEEK_END)
        position = qfile.tell()
        line = ''
        while position >= 0:
            qfile.seek(position)
            next_char = qfile.read(1)
            if next_char == "\n":
                yield line[::-1]
                line = ''
            else:
                line += next_char
            position -= 1
        yield line[::-1]


# pull in the list of subdomains, creating a list of them
subdomains = [line.rstrip('\n') for line in open('subdomains.txt')]
keywords = ['edit', 'proofread', 'transcri']
yesterday = date.today() - timedelta(days=1)

# open the output file for appending
filename = "./output/" + str(time.strftime("%Y-%m-%d")) + ".html"
f = open(filename, 'a')

# check which subdomain the scraper had reached before failing on the last run
location = 0
for line in readlines_reverse(filename):
    if line.find("<h4>") == 0:
        lastSubdomain = line[4:(len(line) - 5)]
        location = subdomains.index(lastSubdomain) + 1
        break

domainString = '.craigslist.org/search/wrg'

# loop over the subdomains, starting from the last one written to the output file
for i in xrange(location, len(subdomains)):
    # write a "header" for the subdomain into the output file (used later to determine scrape progress)
    f.write("<h4>" + subdomains[i] + "</h4>\n")

    # try not to get blocked
    time.sleep(random.randint(9, 15))

    # pull down the subdomain's writing-gigs page
    req = urllib2.Request("http://" + subdomains[i] + domainString,
                          headers={'User-Agent': "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"})
    page = urllib2.urlopen(req).read()
    soup = BeautifulSoup(page, 'html.parser')

    # for each link on the gigs page
    for anchor in soup.findAll("a", class_="hdrlnk"):
        strTimeStamp = anchor.parent.find('time')['datetime']
        strTimeStamp = strTimeStamp[:strTimeStamp.find(' ')]
        date_obj = datetime.strptime(strTimeStamp, '%Y-%m-%d').date()

        # if the link is from yesterday
        if date_obj == yesterday:
            # and it is a relative link
            if anchor['href'].find(".craigslist.org") == -1:
                print(anchor['href'].lower())
                # check each keyword for a match
                for keyword in keywords:
                    # if it matches a keyword, write it to the output file
                    if anchor.decode_contents().lower().find(keyword) != -1:
                        # modify the anchor to include the domain and open in a new tab, then write it to the file
                        anchor['href'] = "http://" + subdomains[i] + ".craigslist.org" + anchor['href']
                        anchor['target'] = "_blank"
                        f.write(str(anchor) + "<br>\n")
                        print(anchor)
                        continue  # unfortunately can't break here, or we'd skip the 'else' clause below
                else:
                    continue
        # even if the post isn't from yesterday, we can still add its subdomain if it isn't a relative link (>NEARBY search results)
        if anchor['href'].find(".craigslist.org") != -1:
            # isolate the subdomain; too lazy to wrestle with regex right now
            newSub = anchor['href'][2:anchor['href'].find(".")]
            # if the subdomain isn't in the list yet, append it to subdomains.txt and to the in-memory list
            if newSub not in subdomains:
                subdomains.append(newSub)
                s = open("subdomains.txt", 'a')
                s.write(newSub + "\n")
                s.close()

# close the output file
f.close()