'''Build word-cloud HTML pages from a debate transcript.

Reads stop words from 'stop2.txt' and the transcript from 'debate.txt',
counts the words spoken after each 'IFILL:', 'PALIN:' or 'BIDEN:' tag and
writes 'biden.html' and 'palin.html', each showing that speaker's 40 most
frequent words scaled by frequency.
'''
import string


def printHTMLfile(body, title):
    ''' create a standard html page with titles, header etc.
    and add the body (an html box) to that page. File created is
    title+'.html'

    body  -- HTML fragment placed below the page heading
    title -- used as the page title, the heading and the file-name stem

    NOTE: the page markup is a minimal reconstruction; the template text
    previously in this function carried no tags and never wrote body out.
    '''
    # The "Last modified" line is a fixed literal carried over unchanged.
    page = """<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
<html> <head>
<title>%s</title>
</head>

<body>
<h1>%s</h1>
%s
<hr>
Last modified: Fri Oct 10 10:09:41 EDT 2008
</body> </html>
"""
    # 'with' guarantees the file is closed even if the write fails.
    with open(title + '.html', 'w') as fd:
        fd.write(page % (title, title, body))


def makeHTMLbox(body):
    ''' make an HTML box that has all the words in it

    body -- HTML fragment (the already formatted words)
    Returns the fragment wrapped in a styled <div>.
    '''
    boxStr = """<div style="
width: 560px;
background-color: rgb(250,250,250);
border: 1px grey solid;
text-align: center" >%s</div>
"""
    # One-element tuple so that a tuple-valued body is not unpacked by '%'.
    return boxStr % (body,)


def makeHTMLword(word, cnt, high, low):
    ''' make a word with a font size to be placed in the box. Font size is
    scaled between htmlBig and htmlLittle (to be user set). high and low
    represent the high and low counts in the document. cnt is the cnt of
    the word

    Returns a <span> element whose font size (in px) grows linearly from
    htmlLittle at cnt == low to htmlBig at cnt == high.
    '''
    htmlBig = 96
    htmlLittle = 14
    if high == low:
        # Every word has the same count: avoid dividing by zero and treat
        # the word as the most frequent one.
        ratio = 1.0
    else:
        # float() keeps this a true division on Python 2 as well.
        ratio = (cnt - low) / float(high - low)
    fontsize = int(htmlBig * ratio + (1 - ratio) * htmlLittle)
    wordStr = '<span style="font-size:%spx;">%s</span>'
    return wordStr % (str(fontsize), word)


def cleanStr(aStr):
    ''' normalise a token: strip surrounding whitespace, lower-case it and
    drop every character listed in string.punctuation

    Returns the cleaned string, which may be empty.
    '''
    aStr = aStr.strip().lower()
    return ''.join(ch for ch in aStr if ch not in string.punctuation)


def setDictionary(fields, bDict, iDict, pDict, curDict):
    ''' pick the dictionary that the words in fields should be counted in

    If the first field is one of the speaker tags 'IFILL:', 'PALIN:' or
    'BIDEN:', the tag is removed from fields (the list is modified in
    place) and the matching dictionary (iDict, pDict or bDict) is
    returned. Otherwise, including for an empty list, curDict is returned
    and fields is left untouched.
    '''
    if not fields:
        return curDict
    speakers = {'IFILL:': iDict, 'PALIN:': pDict, 'BIDEN:': bDict}
    if fields[0] in speakers:
        # The tag is the first element, so popping it drops exactly the tag.
        return speakers[fields.pop(0)]
    return curDict


def extractTopX(theDict, num):
    ''' return the num entries of theDict with the highest counts

    theDict -- maps word -> count
    num     -- how many entries to keep; zero or less gives an empty list

    Returns a list of (word, count) tuples in ascending order of count,
    ties ordered by word, so the last element is the most frequent word.
    '''
    if num <= 0:
        # lst[-0:] would be the whole list, not an empty one.
        return []
    ranked = sorted((cnt, word) for word, cnt in theDict.items())
    return [(word, cnt) for cnt, word in ranked[-num:]]


def _countWords(lines, stopWords, bDict, iDict, pDict):
    ''' tally every cleaned, non-stop word of lines into its speaker's dict '''
    # Words seen before the first speaker tag land in this throwaway dict.
    presentDict = {}
    for line in lines:
        fields = line.strip().split()
        if not fields:
            continue
        presentDict = setDictionary(fields, bDict, iDict, pDict, presentDict)
        for f in fields:
            f = cleanStr(f)
            if f and f not in stopWords:
                presentDict[f] = presentDict.get(f, 0) + 1


def _makeCloud(theDict, num):
    ''' build the HTML box holding the num most frequent words of theDict '''
    top = extractTopX(theDict, num)
    if not top:
        # Nothing was counted for this speaker: emit an empty box.
        return makeHTMLbox('')
    least = top[0][1]
    most = top[-1][1]
    top.sort()  # alphabetical order for display
    words = [makeHTMLword(w, cnt, most, least) for w, cnt in top]
    return makeHTMLbox(' '.join(words))


def main():
    ''' read the stop list and the transcript, then write both pages '''
    with open('stop2.txt') as fd:
        # A set gives constant-time membership tests for every word.
        stopWords = set(line.strip() for line in fd)

    palinDict = {}
    bidenDict = {}
    ifillDict = {}
    with open('debate.txt') as fd:
        _countWords(fd, stopWords, bidenDict, ifillDict, palinDict)

    printHTMLfile(_makeCloud(bidenDict, 40), 'biden')
    printHTMLfile(_makeCloud(palinDict, 40), 'palin')


if __name__ == '__main__':
    main()