% Technical report by R. E. Madsen (2004) on state-space models for text categorization.
% NOTE(review): the institution field (required for techreport entries) was empty in
% the source record and is omitted here rather than guessed. The citation key and the
% URL host suggest a DTU department -- confirm and add it.
% The empty number, series, address and type fields were dropped because they carried no data.
@techreport{IMM2004-03998,
  author   = {Madsen, R. E.},
  title    = {Modeling Text using State Space Models},
  year     = {2004},
  url      = {http://www2.compute.dtu.dk/pubdb/pubs/3998-full.html},
  keywords = {State space, Hidden Markov Model, Text Mining, HMM, LSI, GMM},
  abstract = {Generic ``bag-of-words'' text categorization methods are only based on the information contained in word count histograms. These methods do therefore not capture the information contained in the order in which the words appear in a document. We here consider models that act on both parts of information at the same time, that is, the information about what words appear and in what order they appear. State-space models have the ability to capture information from the order in which the words appear, and combine it with the word appearance probabilities. The state-space models should therefore conceptually supersede the bag-of-words/vector-space models in their ability to model documents correctly. In the following we experiment with two state space model approaches, for making categorization better.},
}