Reverend-0.4/0000755000175000017500000000000011304542342013041 5ustar exarkunexarkunReverend-0.4/examples/0000755000175000017500000000000011304542342014657 5ustar exarkunexarkunReverend-0.4/examples/emailtrainer.py0000755000175000017500000000354310653432314017720 0ustar exarkunexarkun#!/usr/bin/python # This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar: # amir@divmod.org. This is free software; you can redistribute it and/or # modify it under the terms of version 2.1 of the GNU Lesser General Public # License as published by the Free Software Foundation. # from email.Message import Message import email import rfc822 class EmailItem(Message): def summary(self): return { 'From': self.sender(), 'Subject':self.get('subject',''), } def sender(self): fromHeader = self['from'] or '"Nobody" ' hdrs = rfc822.AddressList(fromHeader).addresslist for dispname, addr in hdrs: dispname = dispname.strip().strip('"') addr = addr.strip() if dispname == '': dispname = addr return dispname def columnDefs(self): return [('From', 20), ('Subject', 30)] columnDefs = classmethod(columnDefs) def fromFile(self, fp): try: msg = email.message_from_file(fp, self) except email.Errors.MessageParseError: print 'bad message' return None return msg fromFile = classmethod(fromFile) def runTrainer(): from reverend.ui.trainer import Trainer from Tkinter import Tk from reverend.guessers.email import EmailClassifier root = Tk() root.title('Reverend Trainer') root.minsize(width=300, height=300) #root.maxsize(width=600, height=600) guesser = EmailClassifier() display = Trainer(root, guesser=guesser, itemClass=EmailItem) root.mainloop() def runTester(): from reverend.ui.tester import DirectoryExam de = DirectoryExam('spam', 'Spam', EmailItem) for m, ans in de: print m['from'], ans if __name__ == "__main__": runTrainer() #runTester() Reverend-0.4/examples/readme.txt0000644000175000017500000000256210653432314016665 0ustar exarkunexarkunThis brief readme is designed to help you get started with using the Reverend training and testing UI. This is how I use the trainer. I first prepare a couple of directories full of email. One will have a mix of all kinds of email that I want to classify and one for testing that is, say, containing only spam files. I type: python emailtrainer.py I click on the 'New Pool' button and create a pool for each category or bucket that I want to classify the data into. e.g. 'Clean' and 'Spam'. I use the radio buttons to classify the emails. I page back and forth to make sure that new training does not undo old training. Once I am happy with the training. I click 'Save' to save the Reverend data. I can load it later and continue training. When I want to test, I load the Reverend data using the 'Load' button. I then click on the 'Testing' button on the left. I click 'Run Test' which brings up the first of 2 dialogs, asking me to select the test data, e.g. my directory full of spam. The next dialog asks for the correct answer to this set of messages. I type in 'Spam' (case must match your pool name). I have lots of improvements in mind from training reinforcement to better testing and analysis. The trainer is designed to be data-agnostic. Look at example/emailtrainer.py to see how you can simply wrap your domain objects and make them place nice with the UI. Enjoy, -A- Reverend-0.4/reverend/0000755000175000017500000000000011304542342014653 5ustar exarkunexarkunReverend-0.4/reverend/guessers/0000755000175000017500000000000011304542342016513 5ustar exarkunexarkunReverend-0.4/reverend/guessers/__init__.py0000644000175000017500000000000010327506107020615 0ustar exarkunexarkunReverend-0.4/reverend/guessers/email.py0000755000175000017500000000626111223474266020175 0ustar exarkunexarkun# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar: # amir@divmod.org. This is free software; you can redistribute it and/or # modify it under the terms of version 2.1 of the GNU Lesser General Public # License as published by the Free Software Foundation. # from rfc822 import AddressList from reverend.thomas import Bayes class EmailClassifier(Bayes): def getTokens(self, msg): # Overide from parent # This should return a list of strings # which will be used as the key into # the table of token counts tokens = self.getHeaderTokens(msg) tokens += self.getBodyTokens(msg) # Get some tokens that are generated from the # header and the structure tokens += self.getMetaTokens(msg) return tokens def getBodyTokens(self, msg): text = self.getTextPlain(msg) if text is None: text = '' tl = list(self._tokenizer.tokenize(text)) return tl def getHeaderTokens(self, msg): subj = msg.get('subject','nosubject') text = subj + ' ' text += msg.get('from','fromnoone') + ' ' text += msg.get('to','tonoone') + ' ' text += msg.get('cc','ccnoone') + ' ' tl = list(self._tokenizer.tokenize(text)) return tl def getTextPlain(self, msg): for part in msg.walk(): typ = part.get_content_type() if typ and typ.lower() == "text/plain": text = part.get_payload(decode=True) return text return None def getTextHtml(self, msg): for part in msg.walk(): typ = part.get_content_type() if typ and typ.lower() == "text/html": text = part.get_payload(decode=False) return text return None def getMetaTokens(self, msg): r = [] for f in ['Content-type', 'X-Priority', 'X-Mailer', 'content-transfer-encoding', 'X-MSMail-Priority']: r.append(f +':' + msg.get(f, 'None')) text = self.getTextPlain(msg) html = self.getTextHtml(msg) for stem, part in zip(['text','html'],[text,html]): if part is None: r.append(stem + '_None') continue else: r.append(stem + '_True') l = len(part.split()) if l is 0: a = 'zero' r.append(stem + a) if l > 10000: a = 'more_than_10000' r.append(stem + a) if l > 1000: a = 'more_than_1000' r.append(stem + a) if l > 100: a = 'more_than_100' r.append(stem + a) t = msg.get('to','') at = AddressList(t).addresslist c = msg.get('cc','') ac = AddressList(c).addresslist if at > 5: r.append('to_more_than_5') if at > 10: r.append('to_more_than_10') if ac > 5: r.append('cc_more_than_5') if ac > 10: r.append('cc_more_than_10') return r Reverend-0.4/reverend/ui/0000755000175000017500000000000011304542342015270 5ustar exarkunexarkunReverend-0.4/reverend/ui/__init__.py0000644000175000017500000000000010327506107017372 0ustar exarkunexarkunReverend-0.4/reverend/ui/tester.py0000644000175000017500000001240610327506107017156 0ustar exarkunexarkun# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar: # amir@divmod.org. This is free software; you can redistribute it and/or # modify it under the terms of version 2.1 of the GNU Lesser General Public # License as published by the Free Software Foundation. # from __future__ import generators from Tkinter import * import tkFileDialog import tkSimpleDialog import tkMessageBox import os import time class TestView(Frame): def __init__(self, parent=None, guesser=None, app=None): Frame.__init__(self, parent) self.pack() self.guesser = guesser self.app = app self.size = 300 self.setupViews() def setupViews(self): line = Frame(self, relief=RAISED, borderwidth=1) line.pack(side=TOP, padx=2, pady=1) colHeadings = [('Guesses', 8), ('Right', 8), ('Wrong', 8), ('Accuracy %', 10)] currCol = 0 for cHdr, width in colHeadings: l = Label(line, text=cHdr, width=width, bg='lightblue') l.grid(row=0, column=currCol) currCol += 1 line = Frame(self) line.pack(fill=X) iGuess = IntVar() iRight = IntVar() iWrong = IntVar() iAcc = IntVar() self.model = (iGuess, iRight, iWrong, iAcc) l = Label(line, textvariable=iGuess, anchor=E, width=8, relief=SUNKEN) l.grid(row=0, column=0) l = Label(line, textvariable=iRight, anchor=E, width=8, relief=SUNKEN) l.grid(row=0, column=1) l = Label(line, textvariable=iWrong, anchor=E, width=8, relief=SUNKEN) l.grid(row=0, column=2) l = Label(line, textvariable=iAcc, anchor=E, width=8, relief=SUNKEN) l.grid(row=0, column=3) bp = Button(self, text="Run Test", command=self.runTest) bp.pack(side=BOTTOM) canvas = Canvas(self, width=self.size, height=self.size, bg='lightyellow') canvas.pack(expand=YES, fill=BOTH, side=BOTTOM) self.canvas = canvas ## slid = Scale(self, label='Wrong', variable=iWrong, to=400, orient=HORIZONTAL, bg='red') ## slid.pack(side=BOTTOM) ## slid = Scale(self, label='Right', variable=iRight, to=400, orient=HORIZONTAL, bg='green') ## slid.pack(side=BOTTOM) def runTest(self): # TODO - This is nasty re-write if len(self.guesser) == 0: tkMessageBox.showwarning('Underprepared for examination!', 'Your guesser has had no training. Please train and retry.') return path = tkFileDialog.askdirectory() if not path: return answer = tkSimpleDialog.askstring('Which Pool do these items belong to?', 'Pool name?', parent=self.app) if not answer: return if answer not in self.guesser.pools: return de = DirectoryExam(path, answer, self.app.itemClass) testCount = len(de) scale = self.calcScale(testCount) x = 0 y = 0 cumTime = 0 iGuess, iRight, iWrong, iAcc = self.model for m, ans in de: then = time.time() g = self.guesser.guess(m) cumTime += time.time() - then if g: g = g[0][0] iGuess.set(iGuess.get()+1) if g == ans: col = 'green' iRight.set(iRight.get()+1) else: col = 'red' iWrong.set(iWrong.get()+1) iAcc.set(round(100 * iRight.get()/float(iGuess.get()), 3)) # Plot squares self.canvas.create_rectangle(x*scale,y*scale,(x+1)*scale,(y+1)*scale,fill=col) if not divmod(iGuess.get(),(int(self.size/scale)))[1]: # wrap x = 0 y += 1 else: x += 1 self.update_idletasks() guesses = iGuess.get() self.app.status.log('%r guesses in %.2f seconds. Avg: %.2f/sec.' % (guesses, cumTime, round(guesses/cumTime, 2))) def calcScale(self, testCount): import math scale = int(self.size/(math.sqrt(testCount)+1)) return scale class DirectoryExam(object): """Creates a iterator that returns a pair at a time. (Item, correctAnswer). This Exam creates items from a directory and uses the same answer for each. """ def __init__(self, path, answer, itemClass): self.path = path self.answer = answer self.itemClass = itemClass def __iter__(self): files = os.listdir(self.path) for file in files: fp = open(os.path.join(self.path, file), 'rb') try: item = self.itemClass.fromFile(fp) finally: fp.close() if item is None: continue yield (item, self.answer) def __len__(self): files = os.listdir(self.path) return len(files) Reverend-0.4/reverend/ui/trainer.py0000755000175000017500000003124710327506107017323 0ustar exarkunexarkun# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar: # amir@divmod.org. This is free software; you can redistribute it and/or # modify it under the terms of version 2.1 of the GNU Lesser General Public # License as published by the Free Software Foundation. # from Tkinter import * import tkFileDialog import tkSimpleDialog import tkMessageBox import os from util import Command, StatusBar, Notebook from tester import TestView class PoolView(Frame): def __init__(self, master=None, guesser=None, app=None): Frame.__init__(self, master, bg='lightblue3') self.pack() self.listView = Frame(self) self.listView.pack() bp = Button(self, text="New Pool", command=self.newPool) bp.pack(side=LEFT, anchor=SE) self.addLoadSave() self.columnHeadings() self.model = {} self.guesser = guesser self.app = app self.reload() def reload(self): self.listView.destroy() self.listView = Frame(self) self.listView.pack() for pool in self.guesser.poolNames(): self.addPool(self.guesser.pools[pool]) self.addPool(self.guesser.corpus, 'Total') def upload(self): pass def addLoadSave(self): frame = Frame(self) frame.pack(side=RIGHT) bp = Button(frame, text="Upload", command=self.upload, state=DISABLED) bp.pack(side=BOTTOM, fill=X) bp = Button(frame, text="Save", command=self.save) bp.pack(side=BOTTOM, fill=X) bp = Button(frame, text="Load", command=self.load) bp.pack(side=BOTTOM, fill=X) def addPool(self, pool, name=None): col=None tTok = IntVar() train = IntVar() line = Frame(self.listView) line.pack() if name is None: name = pool.name idx = self.guesser.poolNames().index(name) col = self.defaultColours()[idx] l = Label(line, text=name, anchor=W, width=10) l.grid(row=0, column=0) colourStripe = Label(line, text=' ', width=1, bg=col, anchor=W, relief=GROOVE) colourStripe.grid(row=0, column=1) train = IntVar() train.set(pool.trainCount) l = Label(line, textvariable=train, anchor=E, width=10, relief=SUNKEN) l.grid(row=0, column=2) uTok = IntVar() uTok.set(len(pool)) l = Label(line, textvariable=uTok, anchor=E, width=12, relief=SUNKEN) l.grid(row=0, column=3) tTok = IntVar() tTok.set(pool.tokenCount) l = Label(line, textvariable=tTok, anchor=E, width=10, relief=SUNKEN) l.grid(row=0, column=4) self.model[name]=(pool, uTok, tTok, train) def refresh(self): for pool, ut, tt, train in self.model.values(): ut.set(len(pool)) tt.set(pool.tokenCount) train.set(pool.trainCount) def save(self): path = tkFileDialog.asksaveasfilename() if not path: return self.guesser.save(path) self.app.dirty = False def load(self): path = tkFileDialog.askopenfilename() if not path: return self.guesser.load(path) self.reload() self.app.dirty = False def newPool(self): p = tkSimpleDialog.askstring('Create Pool', 'Name for new pool?') if not p: return if p in self.guesser.pools: tkMessageBox.showwarning('Bad pool name!', 'Pool %s already exists.' % p) self.guesser.newPool(p) self.reload() self.app.poolAdded() self.app.status.log('New pool created: %s.' % p, clear=3) def defaultColours(self): return ['green', 'yellow', 'lightblue', 'red', 'blue', 'orange', 'purple', 'pink'] def columnHeadings(self): # FIXME factor out and generalize title = Label(self, text='Pools', relief=RAISED, borderwidth=1) title.pack(side=TOP, fill=X) msgLine = Frame(self, relief=RAISED, borderwidth=1) msgLine.pack(side=TOP) currCol = 0 colHeadings = [('Name', 10), ('', 1), ('Trained', 10), ('Unique Tokens', 12), ('Tokens', 10)] for cHdr, width in colHeadings: l = Label(msgLine, text=cHdr, width=width, bg='lightblue') l.grid(row=0, column=currCol) currCol += 1 class Trainer(Frame): def __init__(self, parent, guesser=None, itemClass=None): self.status = StatusBar(parent) self.status.pack(side=BOTTOM, fill=X) Frame.__init__(self, parent) self.pack(side=TOP, fill=BOTH) self.itemsPerPage = 20 self.rows = [] for i in range(self.itemsPerPage): self.rows.append(ItemRow()) self.items = [] self.files = [] self.cursor = 0 self.dirty = False if guesser is None: from reverend.thomas import Bayes self.guesser = Bayes() else: self.guesser = guesser if itemClass is None: self.itemClass = TextItem else: self.itemClass = itemClass for row in self.rows: row.summary.set('foo') self.initViews() def initViews(self): self.nb = Notebook(self) ## frame1 = Frame(self.nb()) ## self.poolView = PoolView(frame1, guesser=self.guesser, app=self) ## self.poolView.pack(side=TOP) frame2 = Frame(self.nb()) self.poolView = PoolView(frame2, guesser=self.guesser, app=self) self.poolView.pack(side=TOP) self.listView = Canvas(frame2, relief=GROOVE) self.listView.pack(padx=3) bn = Button(self.listView, text="Load training", command=self.loadCorpus) bn.pack(side=RIGHT, anchor=NE, fill=X) self.columnHeadings() self.addNextPrev() frame3 = Frame(self.nb()) self.testView = TestView(frame3, guesser=self.guesser, app=self) self.testView.pack() frame4 = Frame(self.nb()) bp = Button(frame4, text="Quit", command=self.quitNow) bp.pack(side=BOTTOM) #self.nb.add_screen(frame1, 'Reverend') self.nb.add_screen(frame2, 'Training') self.nb.add_screen(frame3, 'Testing') self.nb.add_screen(frame4, 'Quit') def addNextPrev(self): npFrame = Frame(self.listView) npFrame.pack(side=BOTTOM, fill=X) bn = Button(npFrame, text="Prev Page", command=self.prevPage) bn.grid(row=0, column=0) bn = Button(npFrame, text="Next Page", command=self.nextPage) bn.grid(row=0, column=1) def loadCorpus(self): path = tkFileDialog.askdirectory() if not path: return self.loadFileList(path) self.displayItems() self.displayRows() def bulkTest(self): dirs = [] for pool in self.guesser.poolNames(): path = tkFileDialog.askdirectory() dirs.append((pool, path)) for pool, path in dirs: print pool, path def displayList(self): for item in self.items: self.itemRow(item) def displayRows(self): for row in self.rows: self.displayRow(row) def loadFileList(self, path): listing = os.listdir(path) self.files = [os.path.join(path, file) for file in listing] self.cursor = 0 def prevPage(self): self.cursor = max(0, self.cursor - self.itemsPerPage) self.displayItems() def nextPage(self): self.cursor = min(len(self.files), self.cursor + self.itemsPerPage) self.displayItems() def displayItems(self): theseFiles = self.files[self.cursor:self.cursor + self.itemsPerPage] items = [] for file, row in zip(theseFiles, self.rows): fp = open(file, 'rb') try: item = self.itemClass.fromFile(fp) finally: fp.close() if item is None: continue items.append(item) guesses = self.guesser.guess(item) summary = item.summary() cols = item.columnDefs() s = '' for c, ignore in cols: s += summary[c] + ' ' row.initialize(item, s, guesses, self.guesser.poolNames()) self.items = items def quitNow(self): if self.dirty: if tkMessageBox.askyesno("You have unsaved changes!", "Quit without saving?"): self.quit() self.quit() def columnHeadings(self): # FIXME - Something better for columns and rows in general line = Frame(self.listView, relief=RAISED, borderwidth=1) line.pack(side=TOP, padx=2, pady=1) colHeadings = self.itemClass.columnDefs() currCol = 0 for cHdr, width in colHeadings: l = Label(line, text=cHdr, width=width, bg='lightblue') l.grid(row=0, column=currCol) currCol += 1 line = Frame(self) line.pack(fill=X) def training(self, row): sel = row.selection.get() self.guesser.train(sel, row.original) row.current = sel self.guessAll() def guessAll(self): self.poolView.refresh() pools = self.guesser.poolNames() for row in self.rows: row.setGuess(self.guesser.guess(row.original), pools) def displayRow(self, row, bgc=None): # UGH - REWRITE! line = Frame(self.listView, bg=bgc) line.pack(pady=1) row.line = line self.insertRadios(row) Label(line, text=row.summary.get(), textvariable=row.summary, width=60, bg=bgc, anchor=W).grid(row=0, column=2) #Label(line, text=row.guess, width=7, bg=bgc, anchor=W).grid(row=0, column=1) colourStripe = Label(line, text=' ', width=1, bg=bgc, anchor=W, relief=GROOVE) colourStripe.grid(row=0, column=1) line.colourStripe = colourStripe pools = self.guesser.poolNames() row.refreshColour(pools) def poolAdded(self): if not self.items: return pools = self.guesser.poolNames() for row in self.rows: for r in row.radios: r.destroy() self.insertRadios(row) row.refreshColour(pools) self.dirty = True def insertRadios(self, row): radioFrame = Frame(row.line) radioFrame.grid(row=0, column=0) currCol = 0 radios = [] v = row.selection ci = 0 colours = row.defaultColours() pools = self.guesser.poolNames() for pool in pools: rb = Radiobutton(radioFrame, text=pool, variable=v, value=pool, command=Command(self.training, row), bg=None) rb.grid(row=0, column=currCol) radios.append(rb) currCol += 1 ci += 1 row.radios = radios class TextItem(object): def __init__(self, text): self.text = text def summary(self): return {'Text': self.text} def columnNames(self): return ['Text'] def lower(self): return self.text.lower() def fromFile(self, fp): """Return the first line of the file. """ ti = self(fp.readline()) return ti fromFile = classmethod(fromFile) class ItemRow(object): def __init__(self, orig=None): self.line = None self.radios = [] self.original = orig self.current = '' self.guess = [] self.summary = StringVar() self.selection = StringVar() def initialize(self, item=None, summary='', guess=None, pools=[]): self.selection.set('') self.original = item self.summary.set(summary) self.setGuess(guess, pools) def setGuess(self, guess, pools): if not guess: guess = [['']] self.guess = guess self.selection.set(self.bestGuess()) self.current = self.bestGuess() self.refreshColour(pools) def refreshColour(self, pools): col = None if self.guess[0][0] in pools: idx = pools.index(self.guess[0][0]) col = self.defaultColours()[idx] if self.line: self.line.colourStripe.config(bg=col) def __repr__(self): return self.original.__repr__() def defaultColours(self): return ['green', 'yellow', 'lightblue', 'red', 'blue', 'orange', 'purple', 'pink'] def bestGuess(self): if self.guess: return self.guess[0][0] else: return None if __name__ == "__main__": root = Tk() root.title('Reverend Trainer') root.minsize(width=300, height=300) #root.maxsize(width=600, height=600) display = Trainer(root) root.mainloop() Reverend-0.4/reverend/ui/util.py0000644000175000017500000000603110327506107016622 0ustar exarkunexarkun# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar: # amir@divmod.org. This is free software; you can redistribute it and/or # modify it under the terms of version 2.1 of the GNU Lesser General Public # License as published by the Free Software Foundation. # from Tkinter import * class StatusBar(Frame): """Courtesy of Fredrik Lundh. """ def __init__(self, master): Frame.__init__(self, master) self.label = Label(self, bd=1, relief=SUNKEN, anchor=W) self.label.pack(fill=X) def set(self, format, *args): self.label.config(text=format % args) self.label.update_idletasks() def clear(self): self.label.config(text="") self.label.update_idletasks() def log(self, text, clear=0): # Clear after clear seconds self.set('%s', text) if clear: self.label.after(clear * 1000, self.clear) class Command: """Courtesy of Danny Yoo http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66521 """ def __init__(self, callback, *args, **kwargs): self.callback = callback self.args = args self.kwargs = kwargs def __call__(self): return apply(self.callback, self.args, self.kwargs) class Notebook: """Courtesy of Iuri Wickert http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/188537 """ # initialization. receives the master widget # reference and the notebook orientation def __init__(self, master, side=LEFT): self.active_fr = None self.count = 0 self.choice = IntVar(0) # allows the TOP and BOTTOM # radiobuttons' positioning. if side in (TOP, BOTTOM): self.side = LEFT else: self.side = TOP # creates notebook's frames structure self.rb_fr = Frame(master, borderwidth=2, relief=RIDGE) self.rb_fr.pack(side=side, fill=BOTH) self.screen_fr = Frame(master, borderwidth=2, relief=RIDGE) self.screen_fr.pack(fill=BOTH) # return a master frame reference for the external frames (screens) def __call__(self): return self.screen_fr # add a new frame (screen) to the (bottom/left of the) notebook def add_screen(self, fr, title): b = Radiobutton(self.rb_fr, text=title, indicatoron=0, \ variable=self.choice, value=self.count, \ command=lambda: self.display(fr)) b.pack(fill=BOTH, side=self.side) # ensures the first frame will be # the first selected/enabled if not self.active_fr: fr.pack(fill=BOTH, expand=1) self.active_fr = fr self.count += 1 # hides the former active frame and shows # another one, keeping its reference def display(self, fr): self.active_fr.forget() fr.pack(fill=BOTH, expand=1) self.active_fr = fr Reverend-0.4/reverend/__init__.py0000644000175000017500000000000010327506107016755 0ustar exarkunexarkunReverend-0.4/reverend/thomas.py0000755000175000017500000002434410417764262016545 0ustar exarkunexarkun# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar: # amir@divmod.org. This is free software; you can redistribute it and/or # modify it under the terms of version 2.1 of the GNU Lesser General Public # License as published by the Free Software Foundation. # import operator import re import math from sets import Set class BayesData(dict): def __init__(self, name='', pool=None): self.name = name self.training = [] self.pool = pool self.tokenCount = 0 self.trainCount = 0 def trainedOn(self, item): return item in self.training def __repr__(self): return '' % (self.name, self.tokenCount) class Bayes(object): def __init__(self, tokenizer=None, combiner=None, dataClass=None): if dataClass is None: self.dataClass = BayesData else: self.dataClass = dataClass self.corpus = self.dataClass('__Corpus__') self.pools = {} self.pools['__Corpus__'] = self.corpus self.trainCount = 0 self.dirty = True # The tokenizer takes an object and returns # a list of strings if tokenizer is None: self._tokenizer = Tokenizer() else: self._tokenizer = tokenizer # The combiner combines probabilities if combiner is None: self.combiner = self.robinson else: self.combiner = combiner def commit(self): self.save() def newPool(self, poolName): """Create a new pool, without actually doing any training. """ self.dirty = True # not always true, but it's simple return self.pools.setdefault(poolName, self.dataClass(poolName)) def removePool(self, poolName): del(self.pools[poolName]) self.dirty = True def renamePool(self, poolName, newName): self.pools[newName] = self.pools[poolName] self.pools[newName].name = newName self.removePool(poolName) self.dirty = True def mergePools(self, destPool, sourcePool): """Merge an existing pool into another. The data from sourcePool is merged into destPool. The arguments are the names of the pools to be merged. The pool named sourcePool is left in tact and you may want to call removePool() to get rid of it. """ sp = self.pools[sourcePool] dp = self.pools[destPool] for tok, count in sp.items(): if dp.get(tok): dp[tok] += count else: dp[tok] = count dp.tokenCount += 1 self.dirty = True def poolData(self, poolName): """Return a list of the (token, count) tuples. """ return self.pools[poolName].items() def poolTokens(self, poolName): """Return a list of the tokens in this pool. """ return [tok for tok, count in self.poolData(poolName)] def save(self, fname='bayesdata.dat'): from cPickle import dump fp = open(fname, 'wb') dump(self.pools, fp) fp.close() def load(self, fname='bayesdata.dat'): from cPickle import load fp = open(fname, 'rb') self.pools = load(fp) fp.close() self.corpus = self.pools['__Corpus__'] self.dirty = True def poolNames(self): """Return a sorted list of Pool names. Does not include the system pool '__Corpus__'. """ pools = self.pools.keys() pools.remove('__Corpus__') pools = [pool for pool in pools] pools.sort() return pools def buildCache(self): """ merges corpora and computes probabilities """ self.cache = {} for pname, pool in self.pools.items(): # skip our special pool if pname == '__Corpus__': continue poolCount = pool.tokenCount themCount = max(self.corpus.tokenCount - poolCount, 1) cacheDict = self.cache.setdefault(pname, self.dataClass(pname)) for word, totCount in self.corpus.items(): # for every word in the copus # check to see if this pool contains this word thisCount = float(pool.get(word, 0.0)) if (thisCount == 0.0): continue otherCount = float(totCount) - thisCount if not poolCount: goodMetric = 1.0 else: goodMetric = min(1.0, otherCount/poolCount) badMetric = min(1.0, thisCount/themCount) f = badMetric / (goodMetric + badMetric) # PROBABILITY_THRESHOLD if abs(f-0.5) >= 0.1 : # GOOD_PROB, BAD_PROB cacheDict[word] = max(0.0001, min(0.9999, f)) def poolProbs(self): if self.dirty: self.buildCache() self.dirty = False return self.cache def getTokens(self, obj): """By default, we expect obj to be a screen and split it on whitespace. Note that this does not change the case. In some applications you may want to lowecase everthing so that "king" and "King" generate the same token. Override this in your subclass for objects other than text. Alternatively, you can pass in a tokenizer as part of instance creation. """ return self._tokenizer.tokenize(obj) def getProbs(self, pool, words): """ extracts the probabilities of tokens in a message """ probs = [(word, pool[word]) for word in words if word in pool] probs.sort(lambda x,y: cmp(y[1],x[1])) return probs[:2048] def train(self, pool, item, uid=None): """Train Bayes by telling him that item belongs in pool. uid is optional and may be used to uniquely identify the item that is being trained on. """ tokens = self.getTokens(item) pool = self.pools.setdefault(pool, self.dataClass(pool)) self._train(pool, tokens) self.corpus.trainCount += 1 pool.trainCount += 1 if uid: pool.training.append(uid) self.dirty = True def untrain(self, pool, item, uid=None): tokens = self.getTokens(item) pool = self.pools.get(pool, None) if not pool: return self._untrain(pool, tokens) # I guess we want to count this as additional training? self.corpus.trainCount += 1 pool.trainCount += 1 if uid: pool.training.remove(uid) self.dirty = True def _train(self, pool, tokens): wc = 0 for token in tokens: count = pool.get(token, 0) pool[token] = count + 1 count = self.corpus.get(token, 0) self.corpus[token] = count + 1 wc += 1 pool.tokenCount += wc self.corpus.tokenCount += wc def _untrain(self, pool, tokens): for token in tokens: count = pool.get(token, 0) if count: if count == 1: del(pool[token]) else: pool[token] = count - 1 pool.tokenCount -= 1 count = self.corpus.get(token, 0) if count: if count == 1: del(self.corpus[token]) else: self.corpus[token] = count - 1 self.corpus.tokenCount -= 1 def trainedOn(self, msg): for p in self.cache.values(): if msg in p.training: return True return False def guess(self, msg): tokens = Set(self.getTokens(msg)) pools = self.poolProbs() res = {} for pname, pprobs in pools.items(): p = self.getProbs(pprobs, tokens) if len(p) != 0: res[pname]=self.combiner(p, pname) res = res.items() res.sort(lambda x,y: cmp(y[1], x[1])) return res def robinson(self, probs, ignore): """ computes the probability of a message being spam (Robinson's method) P = 1 - prod(1-p)^(1/n) Q = 1 - prod(p)^(1/n) S = (1 + (P-Q)/(P+Q)) / 2 Courtesy of http://christophe.delord.free.fr/en/index.html """ nth = 1./len(probs) P = 1.0 - reduce(operator.mul, map(lambda p: 1.0-p[1], probs), 1.0) ** nth Q = 1.0 - reduce(operator.mul, map(lambda p: p[1], probs)) ** nth S = (P - Q) / (P + Q) return (1 + S) / 2 def robinsonFisher(self, probs, ignore): """ computes the probability of a message being spam (Robinson-Fisher method) H = C-1( -2.ln(prod(p)), 2*n ) S = C-1( -2.ln(prod(1-p)), 2*n ) I = (1 + H - S) / 2 Courtesy of http://christophe.delord.free.fr/en/index.html """ n = len(probs) try: H = chi2P(-2.0 * math.log(reduce(operator.mul, map(lambda p: p[1], probs), 1.0)), 2*n) except OverflowError: H = 0.0 try: S = chi2P(-2.0 * math.log(reduce(operator.mul, map(lambda p: 1.0-p[1], probs), 1.0)), 2*n) except OverflowError: S = 0.0 return (1 + H - S) / 2 def __repr__(self): return '' % [self.pools[p] for p in self.poolNames()] def __len__(self): return len(self.corpus) class Tokenizer: """A simple regex-based whitespace tokenizer. It expects a string and can return all tokens lower-cased or in their existing case. """ WORD_RE = re.compile('\\w+', re.U) def __init__(self, lower=False): self.lower = lower def tokenize(self, obj): for match in self.WORD_RE.finditer(obj): if self.lower: yield match.group().lower() else: yield match.group() def chi2P(chi, df): """ return P(chisq >= chi, with df degree of freedom) df must be even """ assert df & 1 == 0 m = chi / 2.0 sum = term = math.exp(-m) for i in range(1, df/2): term *= m/i sum += term return min(sum, 1.0) Reverend-0.4/README.txt0000644000175000017500000000236210327506107014545 0ustar exarkunexarkunReverend is a simple Bayesian classifier. It is designed to be easy to adapt and extend for your application. A simple example would look like: from reverend.thomas import Bayes guesser = Bayes() guesser.train('fish', 'salmon trout cod carp') guesser.train('fowl', 'hen chicken duck goose') guesser.guess('chicken tikka marsala') You can also "forget" some training: guesser.untrain('fish','salmon carp') The first argument of train is the bucket or class that you want associated with the training. If the bucket does not exists, Bayes will create it. The second argument is the object that you want Bayes to be trained on. By default, Bayes expects a string and uses something like string.split to break it into indidual tokens (words). It uses these tokens as the basis of its bookkeeping. The two ways to extend it are: 1. Pass in a function as the tokenizer when creating your Bayes. The function should expect one argument which will be whatever you pass to the train() method. The function should return a list of strings, which are the tokens that are relevant to your app. 2. Subclass Bayes and override the method getTokens to return a list of string tokens relevant to your app. I hope all you guesses are right, amir@divmod.org Reverend-0.4/changelog.txt0000644000175000017500000000172310327506107015537 0ustar exarkunexarkun25 November 2004 Release 0.3 Fixed error in calculation. Simpler regex tokenization. Now works with unicode. Removed split.py. 5 October 2003 Release 0.2.4 Added utility methods for removing, renaming and merging Pools: removePool(), renamePool() and mergePools() Also added utility methdos for inspecting pool data: poolData() and poolTokens() All of these methods take pool names as arguments. 25 Aug 2003 Release 0.2.3 Made it possible to pass an iterator of tokens. 16 Aug 2003 Release 0.2.2 Added ability to "forget" training using Bayes.untrain() 2 Aug 2003 Release 0.2.1 Removed the declaration of slots the tokenizer to make it play nice with Quotient. No change in functionality. 16 June 2003 Release 0.2 Added basic GUI for training and testing. Made the storage class pluggable, so different storage managers can be used. Some convenience functions and better repr. Removed some code that was not being run. 18 May 2003 Release 0.1 Initial release Reverend-0.4/setup.py0000644000175000017500000000215111304541016014547 0ustar exarkunexarkun# This module is part of the Reverend project and is Copyright 2003 Amir # Bakhtiar (amir@divmod.org). This is free software; you can redistribute # it and/or modify it under the terms of version 2.1 of the GNU Lesser # General Public License as published by the Free Software Foundation. from distutils.core import setup setup(name="Reverend", version="0.4", description="Divmod Reverend - a simple Bayesian classifier", author="Amir Bakhtiar", author_email="amir hat divmod point org", url="http://www.divmod.org/", packages=['reverend', 'reverend.ui', 'reverend.guessers'], classifiers=[ "Development Status :: 7 - Inactive", "Intended Audience :: Developers", "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)", "Natural Language :: English", "Programming Language :: Python", "Topic :: Communications :: Email :: Filters", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Text Processing", ]) Reverend-0.4/PKG-INFO0000644000175000017500000000126111304542342014136 0ustar exarkunexarkunMetadata-Version: 1.0 Name: Reverend Version: 0.4 Summary: Divmod Reverend - a simple Bayesian classifier Home-page: http://www.divmod.org/ Author: Amir Bakhtiar Author-email: amir hat divmod point org License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN Classifier: Development Status :: 7 - Inactive Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL) Classifier: Natural Language :: English Classifier: Programming Language :: Python Classifier: Topic :: Communications :: Email :: Filters Classifier: Topic :: Software Development :: Libraries :: Python Modules Classifier: Topic :: Text Processing