#checkDisciplines.py
#written by Cliff Harrison (cliff at cliffordharrison dot com)
#Eastern Illinois University
#Fall 2014
#
#This script checks BePress subdiscipline web pages for occurrences of the
#string "Eastern Illinois University".
#
#Originally written/tested using python 2.7. The import fallback and the
#byte-string search below are there so the same file also runs on Python 3.
#
#Usage:
#    python checkDisciplines.py [url-list-file]
#
#The URL list is a text file with one URL per line. When no file is given the
#master list "sub-disciplines-master-list.txt" is used; pass a shorter list
#(e.g. "sub-disciplines-TEST-list.txt") for testing. A relative file name is
#resolved against the current working directory.

import sys
from contextlib import closing
from time import strftime

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2.7

SEARCH_PHRASE = "Eastern Illinois University"
# Page content is searched as raw bytes, so the needle must be bytes as well.
# On Python 2.7 a b"" literal is an ordinary str, so this works on both.
SEARCH_BYTES = b"Eastern Illinois University"
DEFAULT_URL_LIST = "sub-disciplines-master-list.txt"


def fetch_page(url):
    """Return the raw content of *url* as a byte string.

    The response is wrapped in ``closing`` so it is released even if
    ``read`` fails, and so the code does not depend on the response object
    itself being a context manager.
    """
    with closing(urlopen(url)) as page:
        return page.read()


def check_urls(lines, results, fetch=fetch_page, phrase=SEARCH_BYTES):
    """Fetch every URL in *lines* and record the ones containing *phrase*.

    lines   -- iterable of strings, one URL each (an open text file works);
               surrounding whitespace and line breaks are stripped and blank
               lines are skipped.
    results -- writable file-like object; each matching URL is written to it
               followed by a newline.
    fetch   -- callable taking a URL and returning the page content.
    phrase  -- value searched for in the page content with ``in``.

    Returns a tuple ``(found, missing, failed)`` of page counts. A URL whose
    fetch raises IOError or ValueError is reported on stderr and counted as
    failed instead of aborting the whole run.
    """
    found = 0
    missing = 0
    failed = 0
    for raw_line in lines:
        # str.replace/strip return a new string; the result must be kept,
        # otherwise the trailing line break ends up inside the URL.
        url = raw_line.strip()
        if not url:
            continue
        try:
            content = fetch(url)
        except (IOError, ValueError) as err:
            sys.stderr.write("could not fetch %s: %s\n" % (url, err))
            failed += 1
            continue
        if phrase in content:
            results.write(url + "\n")
            found += 1
        else:
            missing += 1
    return found, missing, failed


def summary_message(found, missing, failed=0):
    """Build the summary line appended to the end of the results file.

    The note about failed pages is only added when *failed* is non-zero, so
    a run without failures produces the same sentence as before.
    """
    message = ("'" + SEARCH_PHRASE + "' appeared on " + str(found)
               + " pages and did not appear on " + str(missing) + " pages.")
    if failed:
        message += " " + str(failed) + " pages could not be fetched."
    return message


def main(argv=None):
    """Run the check and write a time-stamped results file.

    argv -- list of command-line arguments (without the program name);
            defaults to ``sys.argv[1:]``. The first argument, if present,
            names the URL list file.
    """
    args = sys.argv[1:] if argv is None else argv
    list_path = args[0] if args else DEFAULT_URL_LIST

    # The current time is appended to the file name so results files are not
    # over-written on subsequent runs.
    current_time = strftime("%Y-%m-%d_%H-%M-%S")
    results_path = "pages-with-EIU_" + current_time + ".txt"

    # Both files are closed by the with-statement, even when a fetch raises.
    with open(list_path, "r") as url_list, open(results_path, "w") as results:
        found, missing, failed = check_urls(url_list, results)
        results.write("\n\n" + summary_message(found, missing, failed))


if __name__ == "__main__":
    main()