% Conference paper proposing the Dirichlet compound multinomial (DCM) model
% for word burstiness in text. Empty placeholder fields from the original
% export were dropped; the citation key is unchanged.
@inproceedings{IMM2005-03996,
  author    = {Madsen, R. E. and Kauchak, D. and Elkan, C.},
  title     = {Modeling Word Burstiness Using the {Dirichlet} Distribution},
  booktitle = {International Conference on Machine Learning},
  pages     = {489--498},
  year      = {2005},
  month     = jun,
  keywords  = {Text mining, {DCM}, Polya, Multinomial, Categorization, Supervised},
  url       = {http://www2.compute.dtu.dk/pubdb/pubs/3996-full.html},
  abstract  = {Multinomial distributions are often used to model text documents. However, they do not capture well the phenomenon that words in a document tend to appear in bursts: if a word appears once, it is more likely to appear again. In this paper, we propose the Dirichlet compound multinomial model ({DCM}) as an alternative to the multinomial. The {DCM} model has one additional degree of freedom, which allows it to capture burstiness. We show experimentally that the {DCM} is substantially better than the multinomial at modeling text data, measured by perplexity. We also show using three standard document collections that the {DCM} leads to better classification than the multinomial model. {DCM} performance is comparable to that obtained with multiple heuristic changes to the multinomial model.},
}