2 # Copyright (c) 2010 Google Inc. All rights reserved.
4 # Redistribution and use in source and binary forms, with or without
5 # modification, are permitted provided that the following conditions are
8 # * Redistributions of source code must retain the above copyright
9 # notice, this list of conditions and the following disclaimer.
10 # * Redistributions in binary form must reproduce the above
11 # copyright notice, this list of conditions and the following disclaimer
12 # in the documentation and/or other materials provided with the
14 # * Neither the name of Google Inc. nor the names of its
15 # contributors may be used to endorse or promote products derived from
16 # this software without specific prior written permission.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 """This python script creates the raw data that is our entity
31 database. The representation is one string database containing all
32 strings we could need, and then a mapping from offset+length -> entity
33 data. That is compact, easy to use and efficient."""
def convert_value_to_int(value):
    """Convert a "U+XXXX" code point string into a C hexadecimal literal.

    Args:
        value: A code point written as "U+" followed by hex digits, or an
            empty string when the entity has no code point in this slot.

    Returns:
        "0x" followed by the hex digits, or "0" when value is empty.

    Raises:
        AssertionError: if a non-empty value does not start with "U+".
    """
    # The table writer passes "" for a missing second code point; emit a
    # literal zero instead of indexing into the empty string.
    if not value:
        return "0"
    assert value.startswith("U+"), "Expected U+XXXX, got %r" % (value,)
    return "0x" + value[2:]
def offset_table_entry(offset):
    """Format one initializer line that takes the address of
    staticEntityTable[offset]."""
    line_template = " &staticEntityTable[%s],"
    return line_template % offset
# Name of this script, used only in the usage message.
program_name = os.path.basename(__file__)

# Expected invocation: <script> -o OUTPUT_FILE INPUT_FILE
if len(sys.argv) < 4 or sys.argv[1] != "-o":
    # Python 3, change to: print("Usage: %s -o OUTPUT_FILE INPUT_FILE" % program_name, file=sys.stderr)
    sys.stderr.write("Usage: %s -o OUTPUT_FILE INPUT_FILE\n" % program_name)
    # Stop here: carrying on would either index past the end of sys.argv or
    # treat the wrong arguments as the output/input paths.
    sys.exit(1)

# Positional arguments that follow the "-o" flag.
output_path = sys.argv[2]
input_path = sys.argv[3]
# Load every CSV row from the input file and order the rows by their ENTITY
# column; the number of rows is kept for sizing the generated tables.
with open(input_path) as html_entity_names_file:
    entries = sorted(csv.reader(html_entity_names_file),
                     key=lambda entry: entry[ENTITY])

entity_count = len(entries)
# Open the destination file for writing, emit the comment banner and the
# #include line that begin the generated C++ source, then check that the
# input produced at least one entry.
# NOTE(review): no close() for output_file is visible in this view, and the
# end of the header string literal is not visible either -- confirm against
# the full file.
70 output_file
= open(output_path
, "w")
72 output_file
.write("""/*
73 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
75 * Redistribution and use in source and binary forms, with or without
76 * modification, are permitted provided that the following conditions
78 * 1. Redistributions of source code must retain the above copyright
79 * notice, this list of conditions and the following disclaimer.
80 * 2. Redistributions in binary form must reproduce the above copyright
81 * notice, this list of conditions and the following disclaimer in the
82 * documentation and/or other materials provided with the distribution.
84 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
85 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
86 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
87 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
88 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
89 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
90 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
91 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
92 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
93 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
94 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
97 // THIS FILE IS GENERATED BY core/html/parser/create-html-entity-table
98 // DO NOT EDIT (unless you are a ninja)!
101 #include "core/html/parser/HTMLEntityTable.h"
108 assert len(entries
) > 0, "Code assumes a non-empty entity array."
def check_ascii(entity_string):
    """Assert that every character of entity_string is 7-bit ASCII.

    The generated string storage is declared as LChar, so a name containing
    a character outside 0..127 cannot be stored in it.

    Args:
        entity_string: The entity name to validate.

    Raises:
        AssertionError: naming the first offending character.
    """
    for c in entity_string:
        # The range test below works on the numeric code of the character.
        code = ord(c)
        assert 0 <= code <= 127, (c + " is not ASCII. Need to change type " +
                                  "of storage from LChar to UChar to support " +
                                  "this entity.")
# Emit staticEntityStringStorage: a single LChar array holding the characters
# of the entity names, with names overlapped where possible. Each entry gets
# its offset into this array appended as a third list element.
# NOTE(review): several original lines of this section are not visible in
# this view (for example where all_data, entity_offset and saved_by_reusing
# are initialised, and the else/break lines of the branches below) -- confirm
# against the full file before editing.
116 output_file
.write("static const LChar staticEntityStringStorage[] = {\n")
# Open the quote of the first character literal.
117 output_file
.write("'")
# Entries were sorted by entity name before this point.
122 for entry
in entries
:
# The storage type is LChar, so every name must be plain ASCII.
123 check_ascii(entry
[ENTITY
])
124 # Reuse substrings from earlier entries. This saves 1-2000
125 # characters, but it's O(n^2) and not very smart. The optimal
126 # solution has to solve the "Shortest Common Superstring" problem
127 # and that is NP-Complete or worse.
129 # This would be even more efficient if we didn't store the
130 # semi-colon in the array but as a bit in the entry.
131 entity
= entry
[ENTITY
]
# Look for the whole name anywhere in the data accumulated so far.
132 already_existing_offset
= all_data
.find(entity
)
133 if already_existing_offset
!= -1:
# Found: point this entry at the existing copy and count the characters saved.
135 this_offset
= already_existing_offset
136 saved_by_reusing
+= len(entity
)
# Separator before the next run of character literals.
# NOTE(review): presumably only reached when new characters are emitted; the
# enclosing branch is not visible here.
139 output_file
.write(",\n'")
142 # Try the end of the string and see if we can reuse that to
143 # fit the start of the new entity.
# Default placement: the name starts at the current end of the data.
145 this_offset
= entity_offset
# Candidate overlap lengths run from len(entity) - 1 down to 1.
146 for truncated_len
in range(len(entity
) - 1, 0, -1):
147 if all_data
.endswith(entity
[:truncated_len
]):
# Only the part after the overlapping prefix has to be appended.
148 data_to_add
= entity
[truncated_len
:]
149 this_offset
= entity_offset
- truncated_len
150 saved_by_reusing
+= truncated_len
# Write the new characters as char literals separated by "', '".
153 output_file
.write("', '".join(data_to_add
))
154 all_data
+= data_to_add
155 output_file
.write("'")
# Advance the running end-of-data offset by the number of characters added.
156 entity_offset
+= len(data_to_add
)
157 assert len(entry
) == 2, "We will use slot [2] in the list for the offset."
158 assert this_offset
< 32768 # Stored in a 16 bit short.
159 entry
.append(this_offset
)
# Close the storage array.
161 output_file
.write("};\n")
# Map each starting letter to the position of the first entry that begins
# with it; later entries with the same letter leave the mapping untouched.
for offset, entry in enumerate(entries):
    index.setdefault(entry[ENTITY][0], offset)
# Emit the staticEntityTable array, sized by entity_count, with one row per
# entry in the order of the entries list.
# NOTE(review): the remaining format arguments of each row and the text of
# the two trailing write() calls are not visible in this view -- confirm
# against the full file.
169 output_file
.write("""
170 static const HTMLEntityTableEntry staticEntityTable[%s] = {\n""" % entity_count
)
172 for entry
in entries
:
# The VALUE column is split on spaces; at most two parts are accepted.
173 values
= entry
[VALUE
].split(' ')
174 assert len(values
) <= 2, values
# The first two row fields are the converted values; an empty string is
# passed to convert_value_to_int when there is no second value.
175 output_file
.write(' { %s, %s, %s, %s }, // &%s\n' % (
176 convert_value_to_int(values
[0]),
177 convert_value_to_int(values
[1] if len(values
) >= 2 else ""),
183 output_file
.write("""};
187 output_file
.write("""
# Emit uppercaseOffset: for each letter A-Z, the position recorded in index
# for that letter.
# NOTE(review): index[letter] fails if no entity starts with that letter --
# confirm the input always covers every ASCII letter.
191 output_file
.write("static const short uppercaseOffset[] = {\n")
192 for letter
in string
.ascii_uppercase
:
193 output_file
.write("%d,\n" % index
[letter
])
# One extra trailing slot holding the position recorded for 'a'; the generated
# lastEntryStartingWith() reads slot [c - 'A' + 1], so 'Z' needs it.
194 output_file
.write("%d\n" % index
['a'])
# Close the uppercase array and open lowercaseOffset.
195 output_file
.write("""};
197 static const short lowercaseOffset[] = {\n""")
# Same layout for a-z.
198 for letter
in string
.ascii_lowercase
:
199 output_file
.write("%d,\n" % index
[letter
])
# Trailing slot for 'z': the total number of entries.
200 output_file
.write("%d\n" % entity_count
)
201 output_file
.write("""};
203 const LChar* HTMLEntityTable::entityString(const HTMLEntityTableEntry& entry)
205 return staticEntityStringStorage + entry.entityOffset;
208 LChar HTMLEntityTableEntry::lastCharacter() const
210 return HTMLEntityTable::entityString(*this)[length - 1];
213 const HTMLEntityTableEntry* HTMLEntityTable::firstEntryStartingWith(UChar c)
215 if (c >= 'A' && c <= 'Z')
216 return &staticEntityTable[uppercaseOffset[c - 'A']];
217 if (c >= 'a' && c <= 'z')
218 return &staticEntityTable[lowercaseOffset[c - 'a']];
222 const HTMLEntityTableEntry* HTMLEntityTable::lastEntryStartingWith(UChar c)
224 if (c >= 'A' && c <= 'Z')
225 return &staticEntityTable[uppercaseOffset[c - 'A' + 1]] - 1;
226 if (c >= 'a' && c <= 'z')
227 return &staticEntityTable[lowercaseOffset[c - 'a' + 1]] - 1;
231 const HTMLEntityTableEntry* HTMLEntityTable::firstEntry()
233 return &staticEntityTable[0];
236 const HTMLEntityTableEntry* HTMLEntityTable::lastEntry()
238 return &staticEntityTable[%s - 1];