I've used pyparsing for limited-vocabulary command parsing, but here is a little framework on top of pyparsing that addresses your posted example:
from pyparsing import *
# One Forward placeholder per word class. Each is pointed at an (initially
# empty) MatchFirst below, and appendExpr later adds words to that MatchFirst.
transVerb, transVerbPlural, transVerbPast, transVerbProg = (
    Forward(), Forward(), Forward(), Forward())
intransVerb, intransVerbPlural, intransVerbPast, intransVerbProg = (
    Forward(), Forward(), Forward(), Forward())
singNoun, pluralNoun, properNoun = Forward(), Forward(), Forward()
singArticle, pluralArticle = Forward(), Forward()

# Convenience alternations that accept either a transitive or an
# intransitive verb of the given form.
verbProg = transVerbProg | intransVerbProg
verbPlural = transVerbPlural | intransVerbPlural

# Give every placeholder its own empty list of alternatives.
for expr in (
        transVerb, transVerbPlural, transVerbPast, transVerbProg,
        intransVerb, intransVerbPlural, intransVerbPast, intransVerbProg,
        singNoun, pluralNoun, properNoun,
        singArticle, pluralArticle):
    expr << MatchFirst([])
def appendExpr(e1, s):
    """Add the word ``s`` as one more alternative of the expression ``e1``.

    The word is matched with its first character in either upper or lower
    case, the remaining characters exactly as given, followed by a regex word
    boundary so that a shorter word does not match the start of a longer one.

    Parameters:
        e1: an expression whose ``expr`` attribute holds an object with an
            ``exprs`` list; the new matcher is appended to that list.
        s:  the word to register; must be non-empty (``s[0]`` is read).
    """
    # Local import keeps this function self-contained.
    import re

    c1 = s[0]
    # Escape every piece so that characters such as "." or "-" in a word are
    # matched literally instead of being read as regex syntax. For plain
    # alphabetic words the escaped pattern is identical to the unescaped one.
    # NOTE: the trailing \b assumes the word ends in a word character.
    e2 = Regex(r"[%s%s]%s\b" % (re.escape(c1.upper()),
                                 re.escape(c1.lower()),
                                 re.escape(s[1:])))
    e1.expr.exprs.append(e2)
def makeVerb(s, transitive):
    """Register the four forms of one verb.

    ``s`` must hold exactly four whitespace-separated words, in this order:
    the form used with plural subjects, the form used with singular subjects,
    the past form and the progressive form (e.g. "walk walks walked walking").
    ``transitive`` selects whether the forms are added to the transitive or
    the intransitive verb expressions.
    """
    plural_form, singular_form, past_form, prog_form = s.split()
    if transitive:
        targets = (transVerb, transVerbPlural, transVerbPast, transVerbProg)
    else:
        targets = (intransVerb, intransVerbPlural,
                   intransVerbPast, intransVerbProg)
    # Targets and forms are listed in matching order: singular, plural,
    # past, progressive.
    forms = (singular_form, plural_form, past_form, prog_form)
    for target, form in zip(targets, forms):
        appendExpr(target, form)
def makeNoun(s, proper=False):
    """Register a noun.

    With ``proper=True`` the whole string ``s`` is added as a proper noun.
    Otherwise ``s`` is split on whitespace: the first word is the singular
    form and the second word, if present, the plural form. When only one
    word is given the plural defaults to ``s + "s"``.
    """
    if proper:
        appendExpr(properNoun, s)
        return

    # Append the default plural as a fallback candidate, then keep only the
    # first two entries: an explicit plural, when given, takes precedence.
    candidates = s.split()
    candidates.append(s + "s")
    singular, plural = candidates[:2]
    appendExpr(singNoun, singular)
    appendExpr(pluralNoun, plural)
def makeArticle(s, plural=False):
    """Register every whitespace-separated word of ``s`` as an article.

    Words go to the plural article expression when ``plural`` is true and to
    the singular article expression otherwise.
    """
    target = pluralArticle if plural else singArticle
    for word in s.split():
        appendExpr(target, word)
# --- Vocabulary -----------------------------------------------------------
# Verbs are given as "plural-subject singular-subject past progressive".
for _forms in ("disappear disappears disappeared disappearing",
               "walk walks walked walking"):
    makeVerb(_forms, transitive=False)
for _forms in ("see sees saw seeing",
               "like likes liked liking"):
    makeVerb(_forms, transitive=True)

# Common nouns: a single word gets the default "+s" plural; a second word
# supplies an irregular plural.
for _noun in ("dog", "girl", "car", "child children"):
    makeNoun(_noun)
for _name in ("Kim", "Jody"):
    makeNoun(_name, proper=True)

# Articles and determiners.
for _articles in ("a the", "this every"):
    makeArticle(_articles)
makeArticle("the these all some several", plural=True)
# --- Grammar --------------------------------------------------------------
# Object of a transitive verb: a noun phrase, a progressive verb
# ("walking"), or "to" followed by a verb ("to walk").
transObject = (
    singArticle + singNoun
    | properNoun
    | Optional(pluralArticle) + pluralNoun
    | verbProg
    | "to" + verbPlural
)

# Sentence with a singular subject.
_sgSubject = singArticle + singNoun | properNoun
_sgPredicate = (
    intransVerb
    | intransVerbPast
    | (transVerb | transVerbPast) + transObject
)
sgSentence = _sgSubject + _sgPredicate

# Sentence with a plural subject; the article is optional ("Dogs like ...").
_plSubject = Optional(pluralArticle) + pluralNoun
_plPredicate = (
    intransVerbPlural
    | intransVerbPast
    | (transVerbPlural | transVerbPast) + transObject
)
plSentence = _plSubject + _plPredicate

sentence = sgSentence | plSentence
def test(s):
    """Print the sentence ``s``, then its parsed token list or the parse error.

    Uses single-argument ``print(...)`` calls and ``except ... as`` so that
    the function is valid, and prints the same output, on both Python 2.6+
    and Python 3.
    """
    print(s)
    try:
        print(sentence.parseString(s).asList())
    except ParseException as pe:
        # Report the failure instead of aborting, so a run over many
        # sentences continues past one that does not parse.
        print(pe)
# Run the parser over a set of sample sentences, in order.
for _sample in (
        "Kim likes cars",
        "The girl saw the dog",
        "The dog saw Jody",
        "Kim likes walking",
        "Every girl likes dogs",
        "All dogs like children",
        "Jody likes to walk",
        "Dogs like walking",
        "All dogs like walking",
        "Every child likes Jody"):
    test(_sample)
Prints:
Kim likes cars
['Kim', 'likes', 'cars']
The girl saw the dog
['The', 'girl', 'saw', 'the', 'dog']
The dog saw Jody
['The', 'dog', 'saw', 'Jody']
Kim likes walking
['Kim', 'likes', 'walking']
Every girl likes dogs
['Every', 'girl', 'likes', 'dogs']
All dogs like children
['All', 'dogs', 'like', 'children']
Jody likes to walk
['Jody', 'likes', 'to', 'walk']
Dogs like walking
['Dogs', 'like', 'walking']
All dogs like walking
['All', 'dogs', 'like', 'walking']
Every child likes Jody
['Every', 'child', 'likes', 'Jody']
This is likely to get slow as you expand the vocabulary. Half a million entries? I thought that a reasonable functional vocabulary was on the order of 5,000 to 6,000 words. You will also be pretty limited in the sentence structures that you can handle; parsing general natural language is what NLTK is for.