% Thesis record from DTU Informatics (the abstract describes the work as a thesis).
% The degree level is not stated in this record, so the generic misc type is kept.
% NOTE(review): classic styles ignore publisher/address on misc entries; confirm the
% intended entry type (e.g. a thesis type with a school field) against the original source.
@misc{IMM2012-06344,
  author    = {v. Beusekom, J. and Poulsen, P. G.},
  title     = {Textual similarity},
  year      = {2012},
  publisher = {Technical University of Denmark, {DTU} Informatics, {E-}mail: reception@imm.dtu.dk},
  address   = {Asmussens Alle, Building 305, {DK-}2800 Kgs. Lyngby, Denmark},
  note      = {Supervised by Professor Robin Sharp, ris@imm.dtu.dk, {DTU} Informatics},
  url       = {http://www.imm.dtu.dk/English.aspx},
  abstract  = {The goal of the thesis is to try out different algorithms intended for measuring semantic similarity between documents. In order to do this, a tool Similarity Tool has been developed in Java. The tool has four implemented algorithms that all can be run on a set of documents to compute the similarity scores between pairs of documents. To test out how accurately an algorithm solves the problem, similarity scores have been assigned to the pairs of documents in a set by both humans and algorithms and the correlation coefficients between the results have been calculated. The structure of the tool is discussed and the algorithms are then analyzed in terms of time and space complexity as well as accuracy. It is concluded that each algorithm has its own advantages and that it is possible to achieve satisfying results with all algorithms by using certain preprocessing methods.},
}