% Record for C. Ahrensberg, "Automatic Recognition of Blog Entries" (2012),
% DTU Informatics. The citation key is unchanged so existing \cite commands
% keep working.
% NOTE(review): the abstract calls this work a thesis; consider switching to
% a thesis entry type with a school field once the degree type is confirmed.
% NOTE(review): the url points at a generic institute page, not at the
% document itself -- replace with a direct link if one is available.
@misc{IMM2012-06466,
  author    = {Ahrensberg, C.},
  title     = {Automatic Recognition of Blog Entries},
  year      = {2012},
  publisher = {Technical University of Denmark, {DTU} Informatics, {E-}mail: reception@imm.dtu.dk},
  address   = {Asmussens Alle, Building 305, {DK-}2800 Kgs. Lyngby, Denmark},
  note      = {{DTU} supervisors: J{\o}rgen Villadsen, jv@imm.dtu.dk, and Patrick Hagge Cording, phaco@imm.dtu.dk, {DTU} Informatics},
  url       = {http://www.imm.dtu.dk/English.aspx},
  abstract  = {The goal of the thesis is to determine if a page can be recognized as a specific type of page based on the structure of its {HTML} elements. It will try to do so by using Tree Edit Distance to generate a matching structure from said pages structures which then in turn can be used to test against when an arbitrary page is presented, thus answering if the page is a Wordpress blog or not. The algorithm used is the Restricted Top Down Mapping which imposes restrictions enforcing the DocType of {HTML} while mapping from one tree to another. A series of test will be run on the algorithm to determine its precision when answering if a site is a blog or not.},
}