#!/usr/bin/python
# AskMerlin is a weekend hack I did by putting together a short program called NLQ,
# or natural language query, which can be found online at http://gurno.com/adam/nlq/#download
# NLQ is a class that takes an inputted query and outputs 1. keywords and 2. a category
# for the type of question being asked. I am primarily interested in using the keywords
# extracted from a query by NLQ. I shamelessly modified NLQ to add many more
# IGNORE_WORDS and otherwise spruce it up.
# Next, I utilized the multiChoiceGuesser script that some nice person posted
# on the newsgroup a couple of weeks ago. This uses urllib to go out to the web
# and judge the appropriateness of a given answer by how many hits it gets on Google
# when coupled with the original question in a Google search.
# My main contribution, such as it was, was simply to enable the program to create
# choices in order to choose a most appropriate answer. This is done by using
# NLQ to pick out keywords from the HTML page returned by Google in its search
# on the question. This version, AskMerlin, currently takes a long time
# to come to an answer because it must do many web searches. I also have a simpler version
# called aNoDivisorAskMerlin that comes to a much quicker decision, but is not as good in the
# appropriateness of its answers. In either case, you must wait patiently while
# Merlin considers his options... depending on the speed of your internet connection.
# On my cable modem at home, AskMerlin takes about three minutes.
# I am hoping that someone or other might give me ideas on how to improve the
# intelligence behind Merlin's deliberations. This version is just a hack to see if it works
# in principle. I have other ideas to increase "intelligence" but I need more.
# Currently, Merlin is very low IQ, but he has potential for the future.
# Anyway, Merlin can already answer just about any question.
# Someday, perhaps he will even answer correctly or at least with wisdom
# ;-)))))))))))))

# NLQ:
# This is just a bunch of ideas tossed together by someone who doesn't know
# anything about NLQs or even if these "Natural Language Queries" are referred
# to as NLQs.  Take it all with a grain of salt.
# I needed something that could respond semi-intelligently to questions posed to
# it.  Quick and Dirty as they say.
# NLQ.py is still rather dumb.  Don't get fancy.  "Would you please explain to
# me why the sky is blue?" will not get you what you want.  "Why is the sky
# blue?" is much better.

import re
import string
import sys
import urllib

try:  # Python 3 keeps these helpers in submodules of urllib.
    from urllib.parse import urlencode
    from urllib.request import urlopen
except ImportError:  # Python 2 exposes them directly on urllib.
    from urllib import urlencode, urlopen

try:  # Python 2 name for the line-reading prompt.
    _read_line = raw_input
except NameError:  # Python 3 renamed it to input().
    _read_line = input

# stuff
__version__ = "0.1"

# define the question types...
UNKNOWN = 0
KNOWLEDGE = 1
COMPREHENSION = 2
APPLICATION = 3
ANALYSIS = 4
SYNTHESIS = 5
EVALUATION = 6

KNOWLEDGE_WORDS = ["name", "list", "recall", "define", "tell", "match",
                   "who", "what", "when", "describe", "where"]
COMPREHENSION_WORDS = ["retell"]
APPLICATION_WORDS = ["why"]
ANALYSIS_WORDS = ["how", "classify", "outline", "diagram"]
SYNTHESIS_WORDS = []
EVALUATION_WORDS = []

PRONOUNS = ["he", "she", "it", "me", "you", "they", "them", "we", "who",
            "myself", "yourself", "ourself", "I", "me", "my"]
VERBS = ["is", "was", "are", "were", "be", "shall", "am", "isn't", "can't",
         "won't", "shouldn't", "couldn't", "aren't", "do", "don't", ]
OTHER_WORDS = ["if", "to", "too", "there", "will", "the", "a", "let", "I'll",
               "this", "these", "those", "let", "*.", "+*", ".*", "<*", ">*",
               "=*", "*=", "*<", "*>", "*.", "*-", "-*", "*:", ":*", ";*",
               "*;", "*,", ",*", "*.*", "*,*", "*;*", "*:*", "*+*", "*=*",
               "*-*", "*_*", "*<*", "*>*", "*?*", "*/*", "of", "and", "for",
               "very", "not", "in", "on", "up", "has", "from", "which", "and",
               "on", "of", "or", "not", "by", "can", "that", "your", "with",
               "their", "over", "back", "link", "about", "an", "at", "his",
               "enter", "into", "so", "was", "a", "as"]

# Parenthesised so the concatenation can legally span several lines.
IGNORE_WORDS = (VERBS + PRONOUNS + OTHER_WORDS + KNOWLEDGE_WORDS +
                COMPREHENSION_WORDS + APPLICATION_WORDS + ANALYSIS_WORDS)

# Lower-cased set view of IGNORE_WORDS.  NLQ lower-cases its input, so entries
# such as "I" and "I'll" could never match while they kept their capitals.
_IGNORE_SET = frozenset(entry.lower() for entry in IGNORE_WORDS)

# A word containing any of these characters is discarded outright.
_REJECTED_CHARS = frozenset("~@#$%^&<>:;{}[*()_-+=?")

# Word lists paired with their question type, in the order they are tried.
_TYPE_TABLE = (
    (KNOWLEDGE_WORDS, KNOWLEDGE),
    (APPLICATION_WORDS, APPLICATION),
    (ANALYSIS_WORDS, ANALYSIS),
    (SYNTHESIS_WORDS, SYNTHESIS),
    (EVALUATION_WORDS, EVALUATION),
    (COMPREHENSION_WORDS, COMPREHENSION),
)

# Finds "about", optionally followed by markup tags, then a digit group that
# may contain thousands separators.
# NOTE(review): the original pattern was just 'about' with no capture group,
# so .group(1) always failed and every hit count came back as 0.  This pattern
# assumes the result page states its total as "about <number>" -- confirm
# against the markup actually returned.
_HITS_PATTERN = re.compile(r'about\s*(?:<[^>]*>\s*)*(\d[\d,]*)', re.I | re.S)

# Number of keywords dropped from each end of the candidate list.
# NOTE(review): presumably this trims words coming from page boilerplate; the
# value is carried over unchanged from the original script -- confirm.
_EDGE_TRIM = 13


def determine_type(word):
    """Return the question-type constant whose word list contains *word*.

    For right now this only matches the first word of a question.  UNKNOWN is
    returned when no word list contains *word*.
    """
    for words, question_type in _TYPE_TABLE:
        if word in words:
            return question_type
    return UNKNOWN


class NLQ:
    """Split a query into a question type and a list of keywords.

    Attributes set by the constructor:
      tuple    -- the lower-cased, whitespace-split words of the input
      type     -- question type derived from the first word (UNKNOWN if the
                  input has no words)
      keywords -- the remaining words that survive filtering, in input order
    """

    def __init__(self, a_string):
        self.tuple = a_string.lower().split()
        # Empty or all-whitespace input has no first word to classify.
        if self.tuple:
            self.type = determine_type(self.tuple[0])
        else:
            self.type = UNKNOWN
        self.keywords = []
        for word in self.tuple[1:]:
            if _REJECTED_CHARS.intersection(word):
                continue
            if word[0] not in string.ascii_letters:
                continue
            if word[-1] not in string.ascii_letters:
                # Drop one trailing non-letter (e.g. "grass," -> "grass") and
                # keep the word; the original stripped it and then threw the
                # word away.  The first character is a letter and differs
                # from the last one here, so the result is never empty.
                word = word[:-1]
            if word in _IGNORE_SET:
                continue
            self.keywords.append(word)

    def __repr__(self):
        return "type: %s\nkeywords: %s" % (self.type, self.keywords)


class multiChoiceGuesser:
    """Pick the reply that scores best when searched together with a question.

    Each reply is scored as hits(question + ' ' + reply) / hits(reply).
    """

    def __init__(self, question='', replys=None):
        self.question = question
        # A fresh list per instance; a shared default list would leak
        # between instances.
        self.replys = [] if replys is None else replys

    def guessedAnswer(self):
        """Return the index in self.replys of the highest-scoring reply.

        The first index wins on ties.  Raises ValueError when there are no
        replies to choose from.
        """
        if not self.replys:
            raise ValueError('no replys to choose from')
        scores = {}  # reply -> score, so repeated replies are searched once
        hits = []
        for reply in self.replys:
            if reply not in scores:
                x = self._getGoogleHits(self.question + ' ' + reply)
                y = self._getGoogleHits(reply)
                if y == 0:
                    y = 1  # avoid dividing by zero
                # True division: integer division floors most ratios to 0.
                scores[reply] = float(x) / y
            hits.append(scores[reply])
        return hits.index(max(hits))

    @staticmethod
    def _parseHits(googlePage):
        """Return the hit count found in *googlePage*, or 0 if none is found."""
        match = _HITS_PATTERN.search(googlePage)
        if match is None:
            return 0
        return int(match.group(1).replace(',', ''))

    def _getGoogleHits(self, query):
        """Fetch the result page for *query* and return its parsed hit count.

        Errors raised while fetching propagate to the caller; a page with no
        recognisable count yields 0.
        """
        return self._parseHits(_getGooglePage(query))


def _getGooglePage(query):
    """Return the text of the search result page for *query*."""
    query = urlencode({'q': query})
    urlHandle = urlopen('http://www.google.com/search?%s' % query)
    try:
        googlePage = urlHandle.read()
    finally:
        # Close the handle even when read() fails.
        urlHandle.close()
    if not isinstance(googlePage, str):
        # Python 3 returns bytes; the text processing below needs str.
        googlePage = googlePage.decode('utf-8', 'replace')
    return googlePage


def guess(question, replys):
    """Print *question* and the reply multiChoiceGuesser rates most likely."""
    if not replys:
        print('No candidate answers were found for: %s' % question)
        return
    mcg = multiChoiceGuesser(question, replys)
    print('The question is:  %s' % question)
    print('The most likely answer is:  %s' % replys[mcg.guessedAnswer()])
    print('')


def main():
    """Ask for a question, harvest candidate answers from the web, and guess."""
    question = _read_line("What is your question?")
    page_keywords = NLQ(_getGooglePage(question)).keywords
    question_keywords = set(NLQ(question).keywords)
    # Drop every occurrence of the question's own keywords.  The original
    # removed only the first occurrence and raised ValueError when a keyword
    # did not appear on the page at all.
    replys = [word for word in page_keywords
              if word not in question_keywords]
    del replys[:_EDGE_TRIM]
    del replys[-_EDGE_TRIM:]
    print(replys)
    guess(question, replys)


if __name__ == '__main__':
    main()