#!/usr/bin/python
# This is just a bunch of ideas tossed together by someone who doesn't know
# anything about NLQs, or even whether these "Natural Language Queries" are
# referred to as NLQs.  Take it all with a grain of salt.
#
# I needed something that could respond semi-intelligently to questions posed
# to it.  Quick and dirty, as they say.
#
# NLQ.py is still rather dumb.  Don't get fancy.  "Would you please explain to
# me why the sky is blue?" will not get you what you want.  "Why is the sky
# blue?" is much better.
#
# The code below runs unchanged on Python 2 and Python 3.

import urllib
import re
import string
import sys

try:  # Python 2 layout
    from urllib import urlencode, urlopen
except ImportError:  # Python 3 layout
    from urllib.parse import urlencode
    from urllib.request import urlopen

try:  # Python 2: raw_input returns the typed line without evaluating it
    _read_line = raw_input
except NameError:  # Python 3: input already behaves that way
    _read_line = input

__version__ = "0.1"

# Define the question types...
UNKNOWN = 0
KNOWLEDGE = 1
COMPREHENSION = 2
APPLICATION = 3
ANALYSIS = 4
SYNTHESIS = 5
EVALUATION = 6

KNOWLEDGE_WORDS = ["name", "list", "recall", "define", "tell", "match",
                   "who", "what", "when", "describe", "where"]
COMPREHENSION_WORDS = ["retell"]
APPLICATION_WORDS = ["why"]
ANALYSIS_WORDS = ["how", "classify", "outline", "diagram"]
SYNTHESIS_WORDS = []
EVALUATION_WORDS = []

PRONOUNS = ["he", "she", "it", "me", "you", "they", "them", "we", "who",
            "myself", "yourself", "ourself", "I", "me", "my", "his"]
VERBS = ["is", "was", "are", "were", "be", "shall", "am", "isn't", "can't",
         "won't", "shouldn't", "couldn't", "aren't", "do", "don't", ]
OTHER_WORDS = ["if", "to", "too", "there", "will", "the", "a", "let", "I'll",
               "this", "these", "those", "let", "*.", "+*", ".*", "<*", ">*",
               "=*", "*=", "*<", "*>", "*.", "*-", "-*", "*:", ":*", ";*",
               "*;", "*,", ",*", "*.*", "*,*", "*;*", "*:*", "*+*", "*=*",
               "*-*", "*_*", "*<*", "*>*", "*?*", "*/*", "of", "and", "for",
               "very", "not", "in", "on", "up", "has", "from", "which", "and",
               "on", "of", "or", "not", "by", "can", "that", "your", "with",
               "their", "over", "back", "link", "about", "an", "at", "his",
               "into", "enter", "so", "was", "a", "as"]
IGNORE_WORDS = (VERBS + PRONOUNS + OTHER_WORDS + KNOWLEDGE_WORDS +
                COMPREHENSION_WORDS + APPLICATION_WORDS + ANALYSIS_WORDS)

# Any token containing one of these characters is discarded outright.
_REJECT_CHARS = frozenset("~@#$%^&<>:;{}[*()_-+=?")

# Checked in this order; the first list containing the word wins.
_TYPE_TABLE = (
    (KNOWLEDGE_WORDS, KNOWLEDGE),
    (APPLICATION_WORDS, APPLICATION),
    (ANALYSIS_WORDS, ANALYSIS),
    (SYNTHESIS_WORDS, SYNTHESIS),
    (EVALUATION_WORDS, EVALUATION),
    (COMPREHENSION_WORDS, COMPREHENSION),
)


def _emit(*parts):
    """Print *parts* separated by single spaces, like a Python 2 ``print a, b``."""
    print(' '.join('%s' % (part,) for part in parts))


def determine_type(word):
    """Return the question-type constant for *word*.

    For right now this only matches a single word (callers pass the first
    word of the question).  Returns ``UNKNOWN`` when the word is in none of
    the ``*_WORDS`` lists.
    """
    for words, question_type in _TYPE_TABLE:
        if word in words:
            return question_type
    return UNKNOWN


class NLQ:
    """A crude "natural language query": a question type plus keywords.

    Attributes:
        tuple: the lower-cased, whitespace-split words of the input.
        type: question-type constant derived from the first word
            (``UNKNOWN`` for empty input).
        keywords: words after the first that survived filtering.
    """

    def __init__(self, a_string):
        """Split *a_string* into words and extract its type and keywords."""
        self.tuple = a_string.lower().split()
        # Empty input used to raise IndexError on self.tuple[0].
        if self.tuple:
            self.type = determine_type(self.tuple[0])
        else:
            self.type = UNKNOWN
        self.keywords = []

        letters = string.ascii_letters
        # Tokens are lower-cased, so compare against lower-cased ignore words
        # (otherwise entries such as "I" and "I'll" could never match).
        ignored = set(entry.lower() for entry in IGNORE_WORDS)

        for word in self.tuple[1:]:
            if any(char in _REJECT_CHARS for char in word):
                continue
            if word[0] not in letters:
                continue
            # Trim trailing punctuation/digits.  The original trimmed the
            # word and then dropped it; the trimmed word is now kept.
            while word[-1] not in letters:
                word = word[:-1]
            if word in ignored:
                continue
            self.keywords.append(word)

    def __repr__(self):
        return "type: %s\nkeywords: %s" % (self.type, self.keywords)


class multiChoiceGuesser:
    """Pick the candidate reply whose search query reports the most hits."""

    def __init__(self, question='', replys=None):
        """Store the question and candidate replies.

        *replys* defaults to a fresh empty list per instance (a shared
        mutable default would leak state between instances).
        """
        self.question = question
        self.replys = [] if replys is None else replys

    def guessedAnswer(self):
        """Return the index into ``self.replys`` of the best-scoring reply.

        Ties go to the earliest reply.  Raises ValueError when there are no
        replies to choose from.
        """
        if not self.replys:
            raise ValueError("no replies to choose from")
        hits = [self._getGoogleHits(self.question + ' ' + reply)
                for reply in self.replys]
        return hits.index(max(hits))

    def _getGoogleHits(self, query):
        """Return the hit count scraped from the result page, or 0.

        0 is returned when the "of about <b>N</b>" marker is missing or N is
        not an integer.  Network errors propagate to the caller.
        NOTE(review): the marker reflects an old result-page layout; confirm
        it still appears in current responses.
        """
        page = _getGooglePage(query)
        match = re.search('of about <b>(.*?)</b>.', page, re.S)
        if match is None:
            return 0
        try:
            return int(match.group(1).replace(',', ''))
        except ValueError:
            return 0


def _getGooglePage(query):
    """Fetch the search result page for *query* and return it as text."""
    encoded = urlencode({'q': query})
    handle = urlopen('http://www.google.com/search?%s' % encoded)
    try:
        page = handle.read()
    finally:
        # Always release the connection, even if read() fails.
        handle.close()
    if not isinstance(page, str):
        # Python 3 hands back bytes; the regex and NLQ work on text.
        page = page.decode('utf-8', 'replace')
    return page


def guess(question, replys):
    """Print *question* and the reply judged most likely to answer it."""
    mcg = multiChoiceGuesser(question, replys)
    _emit('The question is: ', question)
    _emit('The most likely answer is: ', replys[mcg.guessedAnswer()])
    print('')


if __name__ == "__main__":
    question = _read_line("What is your question?")
    source = _getGooglePage(question)
    b = NLQ(source)
    replys = b.keywords
    u = NLQ(question)
    bad = u.keywords
    # Drop the question's own keywords from the candidates; a keyword that
    # never showed up on the page is simply skipped instead of raising.
    for thing in bad:
        if thing in replys:
            replys.remove(thing)
    # Discard the first and last 13 keywords -- presumably page chrome
    # (navigation/footer text); the cutoff is a heuristic.
    del replys[:13]
    del replys[-13:]
    print(replys)
    if replys:
        guess(question, replys)
    else:
        print('No candidate answers found.')