% Conference paper: Madsen, Sigurdsson, Hansen and Larsen (2004),
% "Vocabulary Pruning for Improved Context Recognition".
% The url field points at the publication-database record for this paper.
@inproceedings{IMM2004-02984,
  author    = {Madsen, R. E. and Sigurdsson, S. and Hansen, L. K. and Larsen, J.},
  title     = {Vocabulary Pruning for Improved Context Recognition},
  booktitle = {Proceedings of the International Joint Conference on Neural Networks},
  pages     = {80--85},
  publisher = {{IEEE} Press},
  year      = {2004},
  month     = aug,
  note      = {special session on machine learning for text mining},
  keywords  = {information gain, sensitivity, neural networks, text classification, dimensionality reduction},
  url       = {http://www2.compute.dtu.dk/pubdb/pubs/2984-full.html},
  abstract  = {Language independent `bag-of-words' representations are surprisingly effective for text classification. The representation is high dimensional though, containing many non-consistent words for text categorization. These non-consistent words result in reduced generalization performance of subsequent classifiers, e.g., from ill-posed principal component transformations. In this communication our aim is to study the effect of reducing the least relevant words from the bag-of-words representation. We consider a new approach, using neural network based sensitivity maps and information gain for determination of term relevancy, when pruning the vocabularies. With reduced vocabularies documents are classified using a latent semantic indexing representation and a probabilistic neural network classifier. Reducing the bag-of-words vocabularies with 90\%-98\%, we find consistent classification improvement using two mid size data-sets. We also study the applicability of information gain and sensitivity maps for automated keyword generation.},
}