Roll src/third_party/WebKit 06cb9e9:a978ee5 (svn 202558:202559)
[chromium-blink-merge.git] / third_party / cld / encodings / compact_lang_det / getonescriptspan.h
blob 936aab4faa65a4d4c4761306e63267103eb530d2
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
5 #ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
6 #define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
8 #include "encodings/compact_lang_det/letterscript_enum.h"
9 #include "encodings/compact_lang_det/compact_lang_det_impl.h"
11 namespace getone {
// Buffer size limits, in chars.
// NOTE(review): kMaxScriptBuffer / kMaxScriptLowerBuffer presumably size
// ScriptScanner's script_buffer_ / script_buffer_lower_; the allocations are
// not visible in this header -- confirm against the .cc file.
static const int kMaxScriptBuffer = 4096;
// 1.5x kMaxScriptBuffer.
static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
static const int kMaxScriptBytes = kMaxScriptBuffer - 8;  // Leave some room
// Size of each LangScanner::answer_buffer*_ debugging array.
static const int kMaxAnswerBuffer = 256;
17 typedef enum UnicodeLScript ULScript;
19 typedef struct {
20 char* text; // Pointer to the span, somewhere
21 int text_bytes; // Number of bytes of text in the span
22 int offset; // Offset of start of span in original input buffer
23 ULScript script; // Script of all the letters in this span
24 Language lang; // Language identified for this span
25 bool truncated; // true if buffer filled up before a
26 // different script or EOF was found
27 } LangSpan;
// Returns true if c is a UTF-8 continuation (trail) byte, i.e. has the bit
// pattern 10xxxxxx (0x80..0xBF). Viewed as a signed 8-bit value those bytes
// are exactly the range -128..-65, so a single "< -64" comparison suffices
// regardless of whether plain char is signed on the target.
static inline bool IsContinuationByte(char c) {
  return static_cast<signed char>(c) < -64;
}
// Gets lscript number for letters; always returns
// 0 (common script) for non-letters.
// NOTE(review): src presumably points at the first byte of a UTF-8
// character; the definition is not visible here -- confirm.
int GetUTF8LetterScriptNum(const char* src);
// Update src pointer to point to next quadgram, +2..+5
// Looks at src[0..4], so the caller must make those five bytes readable.
// Returns the advanced pointer.
const char* AdvanceQuad(const char* src);
42 } // end namespace getone
49 class ScriptScanner {
50 public:
51 ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
52 ~ScriptScanner();
54 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
55 bool GetOneScriptSpan(getone::LangSpan* span);
57 // Force Latin and Cyrillic scripts to be lowercase
58 void LowerScriptSpan(getone::LangSpan* span);
60 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
61 // Force Latin and Cyrillic scripts to be lowercase
62 bool GetOneScriptSpanLower(getone::LangSpan* span);
64 private:
65 int SkipToFrontOfSpan(const char* src, int len, int* script);
67 const char* start_byte_;
68 const char* next_byte_;
69 const char* next_byte_limit_;
70 int byte_length_;
71 bool is_plain_text_;
72 char* script_buffer_; // Holds text with expanded entities
73 char* script_buffer_lower_; // Holds lowercased text
77 class LangScanner {
78 public:
79 LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj,
80 getone::LangSpan* spn, int smoothwidth, int smoothcandidates,
81 int maxlangs, int minlangspan);
82 ~LangScanner();
85 int script() {return script_;}
87 // Use new text
88 // Keep smoothing state if same script, otherwise reinit smoothing
89 void NewText(getone::LangSpan* spn);
91 bool GetOneShortLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
92 bool GetOneLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
94 // The real ones
95 bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
96 getone::LangSpan* span);
97 bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
98 getone::LangSpan* span);
100 // Increases language bias by delta
101 void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj,
102 Language key, int delta);
104 // For debugging output
105 int next_answer_;
106 char answer_buffer_[getone::kMaxAnswerBuffer];
107 char answer_buffer2_[getone::kMaxAnswerBuffer];
108 char answer_buffer3_[getone::kMaxAnswerBuffer];
109 char answer_buffer4_[getone::kMaxAnswerBuffer];
111 private:
112 const char* start_byte_;
113 const char* next_byte_limit_;
114 const char* next_byte_;
115 const char* onelangspan_begin_;
116 int byte_length_;
117 int script_;
118 Language spanlang_;
119 int smoothwidth_;
120 int smoothwidth_2_;
121 int smoothcandidates_;
122 int maxlangs_;
123 int minlangspan_;
124 int rb_size_;
125 int next_rb_;
126 int rb_mask_;
127 uint32* rb_;
128 int* offset_rb_;
131 #endif // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_