Remove type of the document (book, article, etc.) It's not important. And use of...
[propref.git] / extractor / extract_syntactic.go
blob143a32b341a632af2bc1586f965be2d0dd6711d6
1 /*
2 extract_syntactic: extract information included in the file
3 It relies heavily on the syntax of the file.
5 Copyright (C) 2014 Xavier <quatrilio@gmail.com> and other authors.
6 See CONTRIBUTORS file.
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU Affero General Public License as
10 published by the Free Software Foundation, either version 3 of the
11 License, or (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU Affero General Public License for more details.
18 You should have received a copy of the GNU Affero General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
21 Website: https://github.com/quatrilio/propref
27 extractor is a library for extracting information about mathematical documents.
28 The information can be: a) Syntactic: found within the document, like the title. b) Meta-informational: typically filesystem information, like the size or the output of the `file` command. c) Contextual: depending on the whole set of documents, seen as a network [1], like the impact index of the document or the popularity of the propositions.
30 [1] "The information provided by a network itself is different than the sum of the information provided by each individual element in the network" is one of the conclusions drawn from the book "Redes Complejas" by Ricard Solé (in Spanish).
33 package extractor
35 import (
36 "regexp"
37 "log"
38 "errors"
39 "github.com/quatrilio/propref/tools"
// regs lists, per syntax name, the regular expression we search for in the
// source documents.
//
// BUG(#3): LaTeX commands may contain spaces or carriage returns inside them, but only a single carriage return.
// BUG(#5): it does not work with arxiv:1401.5978v1
// BUG(#4): LaTeX: \documentclass{amsart} and others incorporate amsthm (See http://www.ctan.org/pkg/amsart)
var regs = map[string]string{
	// LaTeX: a \usepackage command, with an optional bracketed option list,
	// whose package list contains amsmath (optionally with one package before
	// and one after it). The package name is captured in the named group
	// "usepkg_amsmath".
	"latex": `(\\usepackage(?:\[\w*(?:,\w*)?\])??{(?:\w*,)?(?P<usepkg_amsmath>amsmath)(?:,\w*)?})`,
}
50 // SInfo represents syntactic information of the document. It consists of a common fields of mathematical documents.
51 type SInfo struct {
53 Doctype string //Type of the document (latex, context, etc.). Always is lowercase
54 Subdoctype string //Subtype of the document (for example, amslatex, plain, amsart)
55 Title string //Title of the document
56 Authors []Author //list of authors of the document, ordered by their position in the text
57 Date string //when the document was written. In format "YYYY-MM-DD" (Gregorian calendar)
58 Resume string //the abstract
59 License string //the license of the document
60 MSC []MSC //Mathematical Subject Classification list items
61 Keywords []string //Most important words of the document
62 Words []string //Set (not list) of all words
63 Language string //The language of the document
64 References []Reference //References appearing in the bibliography
65 Statements []Statement //List of all statements
// Author holds the information about an author. For now it is just a string
// with his or her complete name (including an alias, if any).
type Author struct {
	// Name is the whole name of the author.
	Name string
}
// MSC represents a Mathematical Subject Classification item.
type MSC struct {
	// Version is the year of the MSC revision used in the document,
	// e.g. 2010.
	Version uint
	// Type is either "primary" or "secondary".
	Type string
	// Code is the classification code, e.g. "05A10".
	Code string
}
82 // Reference implements information of a reference. A reference is basically an information of document which has more information about whatever we said.
83 type Reference struct {
85 Author []Author //the authors of the document
86 Title string //the title of the document
87 Date string //date which is published or created
88 URL string //the URL in which we found
92 // Statement implements units of information in mathematical documents: concluding items (like theorems, lemmas, corollaries, etc.) in which we arrive at conclusions about objects, a premise item (like definitions, notation, axioms, hyphotesis, conditions) in which we what we need before reasoning, or a remark item (conjecture, problem, note) in which we just note or remark.
93 type Statement struct {
95 Type string //type of the statement
96 Argument Argument //for propositions only
97 Assertion string //what is said
98 Name string //things could have optionally names
99 References []Reference //if the statements points to a references for better understanding
100 Documents []Reference //list of documents in which the statement is present
// Argument is composed of the hypothesis and the thesis, which are the two
// parts of one proposition.
type Argument struct {
	// Hipothesis is the hypothesis part of the proposition.
	Hipothesis string
	// Thesis is the thesis part of the proposition.
	Thesis string
}
110 // Extract_Latex extracts SInfo from LaTeX source files.
111 func Extract_Latex(contents string) (*SInfo, error) {
113 // Compile and search regular expressions depending on syntax
114 re, err := regexp.Compile(regs["latex"])
116 if err != nil {
117 log.Fatal(err)
118 return nil, errors.New("[PropRef:extract_syntactic:Extract_Latex] " + err.Error())
121 result := SInfo{}
122 // Extract all the substrings from contents
123 a := tools.FindStringSubmatchMap(re, contents)
124 result.Doctype = "latex"
125 result.Subdoctype = a["usepkg_amsmath"]
126 return &result, nil
131 // Extract_Syntax extracts information from the string
132 // according to the specified syntax.
133 // It retuns a SInfo struct.
134 func Extract_Syntax(contents string, syntax string) (*SInfo, error) {
136 switch syntax {
137 case "latex": return Extract_Latex(contents)
138 default: return nil, errors.New("[PropRef:extract_syntactic] Not suported syntax")