#!/usr/bin/python
"""askMerlin -- guess the answer to a question from web-search hit counts.

History
-------
askMerlin had to be rewritten from scratch because Google disabled the
functionality the program relied on; it seems Merlin was violating Google's
terms of service, unbeknownst to the author.  This version uses Yahoo
instead, together with a completely new web-scraping algorithm based on
string functions rather than regular expressions.  Simple is better than
complicated!

How it works
------------
AskMerlin was made by putting two scripts together, modifying both, and
adding input/output routines around them.

1. multiChoiceGuesser, posted by Max M on the newsgroup comp.lang.python.
   It goes out to the web and judges how appropriate an answer is by how
   many search hits it gets when coupled with the original question.
   The additions made here: the program asks for a question and then for
   options to choose from, and when no options are given it creates its own
   (see NLQ below).  Two searches are made per option -- one on the question
   plus the option and one on the option by itself -- and the ratio of the
   two is used, which avoids merely choosing the option that has an
   overwhelmingly high hit count all by itself.

2. NLQ ("natural language query"), available from
   http://gurno.com/adam/nlq/#download.  NLQ is a class that takes a query
   and outputs (1) keywords and (2) a category for the type of question
   being asked.  Only the keywords are of primary interest here.  NLQ was
   modified to add many more IGNORE_WORDS and was otherwise spruced up.
   When the user gives no options, NLQ picks keywords out of the page
   returned by a search on the question alone, and those keywords are used
   as the possible answers.  This can take a long time.

Outlook
-------
Surely better algorithms can vastly improve this program!  Hopefully someone
comes up with improved variations; various algorithms could be tried and
their results averaged to produce more accurate answers.  Currently Merlin
may have a low IQ, but he has potential for the future: he can already
answer just about any question, and someday perhaps he will even answer
correctly, or at least with wisdom, most or all of the time.  ;-)
NLQ is still rather dumb too, but hey, it has potential.
"""

import re  # not used directly any more; kept from the original import set
import string
import sys

try:  # Python 3 module layout
    from urllib.parse import urlencode
    from urllib.request import urlopen
except ImportError:  # Python 2 module layout
    from urllib import urlencode, urlopen

try:  # Python 2
    _read_line = raw_input
except NameError:  # Python 3
    _read_line = input

# stuff
__version__ = "0.1"

# Define the question types...
UNKNOWN = 0
KNOWLEDGE = 1
COMPREHENSION = 2
APPLICATION = 3
ANALYSIS = 4
SYNTHESIS = 5
EVALUATION = 6

KNOWLEDGE_WORDS = ["name", "list", "recall", "define", "tell", "match",
                   "who", "what", "when", "describe", "where"]
COMPREHENSION_WORDS = ["retell"]
APPLICATION_WORDS = ["why"]
ANALYSIS_WORDS = ["how", "classify", "outline", "diagram"]
SYNTHESIS_WORDS = []
EVALUATION_WORDS = []

PRONOUNS = ["he", "she", "it", "me", "you", "they", "them", "we", "who",
            "myself", "yourself", "ourself", "I", "me", "my"]
VERBS = ["is", "was", "are", "were", "be", "shall", "am", "isn't", "can't",
         "won't", "shouldn't", "couldn't", "aren't", "do", "don't", ]
OTHER_WORDS = ["if", "to", "too", "there", "will", "the", "a", "let",
               "I'll", "this", "these", "those", "let", "*.", "+*", ".*",
               "<*", ">*", "=*", "*=", "*<", "*>", "*.", "*-", "-*", "*:",
               ":*", ";*", "*;", "*,", ",*", "*.*", "*,*", "*;*", "*:*",
               "*+*", "*=*", "*-*", "*_*", "*<*", "*>*", "*?*", "*/*",
               "of", "and", "for", "very", "not", "in", "on", "up", "has",
               "from", "which", "and", "on", "of", "or", "not", "by", "can",
               "that", "your", "with", "their", "over", "back", "link",
               "about", "an", "at", "his", "enter", "into", "so", "was",
               "a", "as", "but"]
IGNORE_WORDS = (VERBS + PRONOUNS + OTHER_WORDS + KNOWLEDGE_WORDS +
                COMPREHENSION_WORDS + APPLICATION_WORDS + ANALYSIS_WORDS)

# NLQ lowercases its input before filtering, so the lookup set is lowercased
# as well; otherwise entries such as "I" and "I'll" could never match.
_IGNORE_LOOKUP = frozenset([entry.lower() for entry in IGNORE_WORDS])

# A word containing any of these characters is never accepted as a keyword.
_REJECT_CHARS = "~@#$%^&<>:;{}[*()_-+=?"

_LETTERS = string.ascii_letters

# Search endpoint and the scraping parameters for its result pages.
_SEARCH_URL = 'http://search.yahoo.com/bin/search?%s'
_HITS_MARKER = "out of about"
_HITS_WINDOW = 26  # characters inspected after the marker for the count

_FIRST_WAIT_MESSAGE = """Since you did not give Merlin any options, it may take a while as he thinks.
Please be patient; if you do not touch your keyboard or mouse for a few minutes, Merlin will respond ;-)))))"""

_NEXT_WAIT_MESSAGE = """Since you did not give Merlin any options, it may take a while as he thinks.
Please be patient and if you do not touch your keyboard or mouse for a few minutes, Merlin will respond."""


def determine_type(word):
    """Return the question-type constant for a single (lowercase) word.

    For right now this only matches one word (callers pass the first word of
    the query).  Returns UNKNOWN when the word is in none of the word lists.
    """
    # Checked in this order so a word present in several lists resolves the
    # same way it always has.
    candidates = (
        (KNOWLEDGE_WORDS, KNOWLEDGE),
        (APPLICATION_WORDS, APPLICATION),
        (ANALYSIS_WORDS, ANALYSIS),
        (SYNTHESIS_WORDS, SYNTHESIS),
        (EVALUATION_WORDS, EVALUATION),
        (COMPREHENSION_WORDS, COMPREHENSION),
    )
    for words, question_type in candidates:
        if word in words:
            return question_type
    return UNKNOWN


class NLQ(object):
    """Natural language query: classify a text and extract its keywords.

    Attributes:
        tuple: the lowercased, whitespace-split words of the input
            (a list, despite the historical name).
        type: question type of the first word (UNKNOWN for empty input).
        keywords: accepted words, in input order; the first word of the
            input is never considered.
    """

    def __init__(self, a_string):
        """Split *a_string* into words and collect its keywords.

        A word is accepted when it contains none of the rejected special
        characters, starts with an ASCII letter and, once trailing
        non-letter characters are removed, is not an ignore word.
        """
        self.tuple = a_string.lower().split()
        # Empty or whitespace-only input has no first word to classify.
        if self.tuple:
            self.type = determine_type(self.tuple[0])
        else:
            self.type = UNKNOWN
        self.keywords = []
        for word in self.tuple[1:]:
            if [ch for ch in word if ch in _REJECT_CHARS]:
                continue
            if word[0] not in _LETTERS:
                continue
            # Strip trailing punctuation ("live," -> "live") and keep the
            # word; previously the stripped word was silently discarded.
            # word[0] is a letter, so the word can never become empty.
            while word[-1] not in _LETTERS:
                word = word[:-1]
            if word in _IGNORE_LOOKUP:
                continue
            self.keywords.append(word)

    def __repr__(self):
        return "type: %s\nkeywords: %s" % (self.type, self.keywords)


def _fetch_search_page(query):
    """Return the search-result page for *query* as text.

    The query is URL-encoded as the ``p`` parameter and the connection is
    always closed, even when reading fails.
    """
    handle = urlopen(_SEARCH_URL % urlencode({'p': query}))
    try:
        page = handle.read()
    finally:
        handle.close()
    if not isinstance(page, str):
        # Python 3 hands back bytes; the scraping below works on text.
        page = page.decode('utf-8', 'replace')
    return page


class multiChoiceGuesser(object):
    """Pick the most plausible reply to a question by search hit counts."""

    def __init__(self, question='', replys=()):
        self.question = question
        self.replys = replys

    def guessedAnswer(self):
        """Return the index in ``self.replys`` of the guessed answer.

        For each reply two searches are made: question plus reply, and the
        reply alone.  The reply with the smallest ratio
        ``hits(reply) / hits(question + reply)`` wins, so a reply is not
        chosen merely because it is overwhelmingly common by itself.

        Raises:
            ValueError: if there are no replies to choose from.
        """
        if not self.replys:
            raise ValueError("no replies to choose from")
        ratios = []
        for reply in self.replys:
            combined = float(self._getGoogleHits(self.question + ' ' + reply))
            alone = float(self._getGoogleHits(reply))
            if combined == 0:
                combined = 1.0  # avoid dividing by zero
            ratios.append(alone / combined)
        return ratios.index(min(ratios))

    def _getGoogleHits(self, query):
        """Return the hit count reported for *query*.

        The name is historical: the search is sent to the Yahoo endpoint.
        """
        return self._parseHitCount(_fetch_search_page(query))

    @staticmethod
    def _parseHitCount(page):
        """Extract the hit count that follows "out of about" in *page*.

        Returns 0 when the marker is absent; guessedAnswer already treats
        a zero count as a valid result.

        Raises:
            ValueError: if the text after the marker is not a number.
        """
        found = page.find(_HITS_MARKER)
        if found < 0:
            return 0
        # Skip the marker and the single character that follows it, then
        # read the digits up to the next tag.
        start = found + len(_HITS_MARKER) + 1
        number = page[start:start + _HITS_WINDOW].split("<", 1)[0]
        return int(number.replace(",", ""))


def guess(question, choices):
    """Print the question followed by Merlin's pick among *choices*."""
    print(' The question is:  %s' % (question,))
    if not choices:
        print(" Merlin could not find any options to choose from.")
        print('')
        return
    mcg = multiChoiceGuesser(question, choices)
    # Show the prefix before the (slow) searches start, as the original
    # print statement did.
    sys.stdout.write(" Please wait for Merlin's answer:  ")
    sys.stdout.flush()
    print(choices[mcg.guessedAnswer()])
    print('')


def get_list(heading, prompt):
    """Read lines from the user until a blank line and return them.

    *prompt* must contain one ``%d`` placeholder; it receives the 1-based
    number of the entry being asked for.
    """
    print(heading)
    print('')
    print("(enter a blank line to end the list)")
    entries = []
    while 1:
        line = _read_line(prompt % (len(entries) + 1))
        if not line:
            break
        entries.append(line)
    print('')
    return entries


def _keyword_choices(question):
    """Build answer options from the search page for *question* alone."""
    # NOTE(review): the original also computed NLQ(question).keywords into
    # an unused variable named `bad`; presumably those were meant to be
    # removed from the choices, but that was never implemented.
    choices = NLQ(_fetch_search_page(question)).keywords
    # The first and last keywords are dropped, as before.
    del choices[:1]
    del choices[-1:]
    return choices


def _ask(question_prompt, wait_message):
    """Run one question/answer round with the given prompt and message."""
    question = _read_line(question_prompt)
    choices = get_list("Enter your options:", "Option %d: ")
    if not choices:
        print(wait_message)
        choices = _keyword_choices(question)
    guess(question, choices)


def main():
    """Ask questions forever; end-of-input or Ctrl-C ends the session."""
    try:
        _ask("What is your question?", _FIRST_WAIT_MESSAGE)
        while 1:
            _ask("what is your next question?", _NEXT_WAIT_MESSAGE)
    except (EOFError, KeyboardInterrupt):
        print('')


if __name__ == "__main__":
    main()