1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
6 #define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
8 #include "encodings/compact_lang_det/letterscript_enum.h"
9 #include "encodings/compact_lang_det/compact_lang_det_impl.h"
// Buffer-size limits. Units are not stated here -- presumably bytes; confirm
// against the code that allocates the buffers.
static const int kMaxScriptBuffer = 4096;
// One and a half times kMaxScriptBuffer (6144).
static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
// Eight less than kMaxScriptBuffer (4088), to leave some room.
static const int kMaxScriptBytes = kMaxScriptBuffer - 8;
// Length of each answer_buffer*_ array declared later in this header.
static const int kMaxAnswerBuffer = 256;
17 typedef enum UnicodeLScript ULScript
;
20 char* text
; // Pointer to the span, somewhere
21 int text_bytes
; // Number of bytes of text in the span
22 int offset
; // Offset of start of span in original input buffer
23 ULScript script
; // Script of all the letters in this span
24 Language lang
; // Language identified for this span
25 bool truncated
; // true if buffer filled up before a
26 // different script or EOF was found
// Returns true when c is a UTF-8 continuation byte, i.e. a byte in the range
// 0x80..0xBF (bit pattern 10xxxxxx); returns false for every other value.
static inline bool IsContinuationByte(char c) {
  // Viewed as a signed 8-bit value, 0x80..0xBF is exactly -128..-65, so a
  // single signed comparison covers the whole range whether plain char is
  // signed or unsigned on the target.
  return static_cast<signed char>(c) < -64;
}
// Gets the lscript number for the letter at src; always returns
// 0 (common script) for non-letters.
// src: pointer to UTF-8 text. NOTE(review): how many bytes are read from src
// is not visible in this header -- check the definition.
int GetUTF8LetterScriptNum(const char* src);
// Returns src advanced to point at the next quadgram, i.e. moved forward by
// +2..+5 bytes.
// NOTE(review): the caller presumably must guarantee those bytes are
// readable -- confirm against the definition.
const char* AdvanceQuad(const char* src);
42 } // end namespace getone
51 ScriptScanner(const char* buffer
, int buffer_length
, bool is_plain_text
);
54 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
55 bool GetOneScriptSpan(getone::LangSpan
* span
);
57 // Force Latin and Cyrillic scripts to be lowercase
58 void LowerScriptSpan(getone::LangSpan
* span
);
60 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
61 // Force Latin and Cyrillic scripts to be lowercase
62 bool GetOneScriptSpanLower(getone::LangSpan
* span
);
// NOTE(review): undocumented in the original. Judging by the name it skips
// ahead in src (at most len bytes) to the start of the next span, reports a
// script through *script and returns a count or offset -- confirm all of this
// against the definition before relying on it.
int SkipToFrontOfSpan(const char* src, int len, int* script);
// Cursor state over the input text. NOTE(review): the roles below are read
// off the member names only -- confirm against the implementation.
const char* start_byte_;        // presumably the first byte of the input
const char* next_byte_;         // presumably the next byte to be scanned
const char* next_byte_limit_;   // presumably one past the last input byte

char* script_buffer_;           // Holds text with expanded entities
char* script_buffer_lower_;     // Holds lowercased text
79 LangScanner(const CompactLangDetImpl::LangDetObj
* langdetobj
,
80 getone::LangSpan
* spn
, int smoothwidth
, int smoothcandidates
,
81 int maxlangs
, int minlangspan
);
85 int script() {return script_
;}
88 // Keep smoothing state if same script, otherwise reinit smoothing
89 void NewText(getone::LangSpan
* spn
);
91 bool GetOneShortLangSpanBoot(getone::LangSpan
* span
); // Just for bootstrapping
92 bool GetOneLangSpanBoot(getone::LangSpan
* span
); // Just for bootstrapping
95 bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj
* langdetobj
,
96 getone::LangSpan
* span
);
97 bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj
* langdetobj
,
98 getone::LangSpan
* span
);
100 // Increases language bias by delta
101 void SetLanguageBias(const CompactLangDetImpl::LangDetObj
* langdetobj
,
102 Language key
, int delta
);
104 // For debugging output
106 char answer_buffer_
[getone::kMaxAnswerBuffer
];
107 char answer_buffer2_
[getone::kMaxAnswerBuffer
];
108 char answer_buffer3_
[getone::kMaxAnswerBuffer
];
109 char answer_buffer4_
[getone::kMaxAnswerBuffer
];
// Cursor state over the text being scanned. NOTE(review): the roles below
// are read off the member names only -- confirm against the implementation.
const char* start_byte_;         // presumably the first byte of the text
const char* next_byte_limit_;    // presumably one past the last byte
const char* next_byte_;          // presumably the next byte to be scanned
const char* onelangspan_begin_;  // presumably where the current span began

int smoothcandidates_;  // presumably the constructor's smoothcandidates
131 #endif // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_