/*
 * Repository viewer header (not part of the original source file):
 *   vfs: check userland buffers before reading them.
 *   [haiku.git] / src / add-ons / kernel / file_systems / googlefs / parse_google_html.c
 *   blob 345238f80d323a98e9d33488d24be40a6a638797
 */
/*
 * Copyright 2004-2008, François Revol, <revol@free.fr>.
 * Distributed under the terms of the MIT License.
 */
#include <errno.h>
#include <sys/param.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <OS.h>
#include "google_request.h"
#include "string_utils.h"
/* TESTME enables the standalone test driver (main) and the PRST trace points. */
#define TESTME

#ifdef _KERNEL_MODE
/* Kernel builds route this file's printf calls to dprintf and drop the test driver. */
#define printf dprintf
#undef TESTME
#endif

/* Prefix prepended to this file's debug messages. */
#define DBG "googlefs: parse_html: "

#ifdef TESTME
/* Size of the data area of the input buffer used by the test driver. */
#define BUFSZ (128*1024)
/* Step counter printed, then incremented, by every PRST trace point. */
int dbgstep = 0;
#define PRST printf(DBG "step %d\n", dbgstep++)
#else
/*
 * Outside test builds a trace point is a single empty statement, so that
 * "PRST;" stays safe inside an unbraced if/else.
 */
#define PRST do { } while (0)
#endif
/*
 * Literal HTML markers used to locate the fields of one search result.
 *
 * Older G_BEGIN_URL variants, kept for reference:
 *   "<p class=g><a class=l href=\""
 *   "<div class=g><a class=l href=\""
 *   "<div class=g><a href=\""
 *
 * There is no G_BEGIN_NAME: the parser takes the name to start right after
 * G_END_URL.
 */
#define G_BEGIN_URL "<h3 class=\"r\"><a href=\""
#define G_END_URL "\">"
#define G_END_NAME "</a>"
/* An earlier snippet marker was "<td class=j>". */
#define G_BEGIN_SNIPSET "<font size=-1>"
#define G_END_SNIPSET "<br>"
#define G_BEGIN_CACHESIM " <a class=fl href=\""
#define G_END_CACHESIM "\">"
/* Prepended to result URLs that start with "/url?". */
#define G_URL_PREFIX "http://www.google.com"
48 int google_parse_results(const char *html, size_t htmlsize, long *nextid, struct google_result **results)
50 struct google_result *res = NULL, *nres = NULL, *prev = NULL;
51 char *p, *q;
52 char *nextresult = NULL;
53 long numres = 0;
54 long maxres = 1000;
55 //long startid = 0;
56 int done = 0;
57 int err = ENOMEM;
59 if (!html || !results)
60 return EINVAL;
61 /* sanity checks */
62 printf(DBG"sanity check...\n");
63 PRST;
64 /* google now sends <!doctype html><head> sometimes... */
65 if (strstr(html, "<!doctype html><head>") != html) {
66 if (strstr(html, "<html><head>") != html) {
67 if (strstr(html, "<!doctype html><html ") != html)
68 return EINVAL;
71 PRST;
72 // p = strstr(html, "<title>Google Search:");
73 p = strstr(html, "Google");
74 if (!p) return EINVAL;
75 PRST;
76 p = strstr(html, "<body");
77 if (!p) return EINVAL;
78 PRST;
81 p = strstr(html, "Search Results<");
82 if (!p) return EINVAL;
83 PRST;
87 printf(DBG"parsing...\n");
88 do {
89 char *item;
90 long itemlen;
91 char *tmp;
92 char *urlp;
93 int i;
94 #ifdef TESTME
95 dbgstep = 0;
96 #endif
97 nres = malloc(sizeof(struct google_result));
98 if (!nres) {
99 // XXX: cleanup!
100 goto err0;
102 memset(nres, 0, sizeof(struct google_result));
103 nres->id = (*nextid)++; //- 1;
105 PRST;
106 /* find url */
107 // <p class=g><a href=URL>
108 if (!p) break;
109 if (nextresult)
110 p = nextresult;
111 else
112 p = strstr(p, G_BEGIN_URL);
113 if (!p) break;
114 PRST;
115 p+= strlen(G_BEGIN_URL);
116 nextresult = strstr(p, G_BEGIN_URL);
117 //printf(DBG"[%ld] found token 1\n", numres);
118 item = p;
119 p = strstr(p, G_END_URL);
120 if (!p) break;
121 PRST;
122 p+= strlen(G_END_URL);
123 //printf(DBG"[%ld] found token 2\n", numres);
124 itemlen = GR_MAX_URL-1;
125 urlp = nres->url;
126 if (!strncmp(item, "/url?", 5)) {
127 strcpy(urlp, G_URL_PREFIX);
128 itemlen -= strlen(G_URL_PREFIX);
129 urlp += strlen(G_URL_PREFIX);
130 printf("plop\n");
132 itemlen = MIN(itemlen, p - item - strlen(G_END_URL));
133 strncpy(urlp, item, itemlen);
134 urlp[itemlen] = '\0';
136 /* find name */
137 //<b>Google</b> Web APIs - FAQ</a><table
138 item = p;
139 p = strstr(p, G_END_NAME);
140 if (!p) break;
141 PRST;
142 p+= strlen(G_END_NAME);
143 //printf(DBG"[%ld] found token 3\n", numres);
144 itemlen = p - item - strlen(G_END_NAME);
145 //itemlen = MIN(GR_MAX_NAME-1, itemlen);
146 itemlen = MIN(GR_MAX_NAME*4-1, itemlen);
147 q = malloc(itemlen+1);
148 if (!q)
149 goto err0;
150 strncpy(q, item, itemlen);
151 q[itemlen] = '\0';
152 /* strip <*b> off */
153 PRST;
154 while ((tmp = strstr(q, "<b>")))
155 strcpy(tmp, tmp + 3);
156 while ((tmp = strstr(q, "</b>")))
157 strcpy(tmp, tmp + 4);
158 /* strip <*em> off */
159 PRST;
160 while ((tmp = strstr(q, "<em>")))
161 strcpy(tmp, tmp + 4);
162 while ((tmp = strstr(q, "</em>")))
163 strcpy(tmp, tmp + 5);
164 /* strip &foo; */
165 tmp = unentitify_string(q);
166 free(q);
167 if (!tmp)
168 goto err0;
169 strncpy(nres->name, tmp, GR_MAX_NAME-1);
170 nres->name[GR_MAX_NAME-1] = '\0';
171 free(tmp);
172 PRST;
174 #if 0
175 /* find snipset */
176 //<td class=j><font size=-1><b>...</b> a custom Java client library, documentation on <b>how</b> <b>to</b> use the <b>...</b> You can find it at http://<b>api</b>.<b>google</b>.com/GoogleSearch.wsdl <b>...</b> need to get started is in <b>googleapi</b>.jar <b>...</b> <br>
177 if (!p) break;
178 q = strstr(p, G_BEGIN_SNIPSET);
179 if (q && (!nextresult || (q < nextresult))) {
180 p = q;
181 p+= strlen(G_BEGIN_SNIPSET);
182 //printf(DBG"[%ld] found token 4\n", numres);
183 item = p;
184 p = strstr(p, G_END_SNIPSET);
185 if (!p) break;
186 p+= strlen(G_END_SNIPSET);
187 //printf(DBG"[%ld] found token 5\n", numres);
188 itemlen = p - item - strlen(G_END_SNIPSET);
189 itemlen = MIN(GR_MAX_URL-1, itemlen);
190 strncpy(nres->snipset, item, itemlen);
191 nres->snipset[itemlen] = '\0';
192 /* strip &foo; */
193 tmp = unentitify_string(nres->snipset);
194 if (!tmp)
195 break;
196 strncpy(nres->snipset, tmp, GR_MAX_SNIPSET-1);
197 nres->snipset[GR_MAX_SNIPSET-1] = '\0';
198 free(tmp);
199 /* strip <*b> off */
200 while ((tmp = strstr(nres->snipset, "<b>")))
201 strcpy(tmp, tmp + 3);
202 while ((tmp = strstr(nres->snipset, "</b>")))
203 strcpy(tmp, tmp + 4);
204 while ((tmp = strstr(nres->snipset, "\r")))
205 strcpy(tmp, tmp + 1);
206 while ((tmp = strstr(nres->snipset, "\n")))
207 *tmp = ' ';
210 #endif
211 /* find cache/similar url */
212 // <a class=fl href="http://216.239.59.104/search?q=cache:vR7BaPWutnkJ:www.google.com/apis/api_faq.html+google+api++help+%22frequently+asked%22+-plop&hl=en&lr=lang_en&ie=UTF-8">Cached</a>
213 for (i = 0; i < 2; i++) {
214 if (!p) break;
215 q = strstr(p, G_BEGIN_CACHESIM);
216 if (q && nextresult && (q > nextresult)) {
217 p = q;
218 printf(DBG"[%ld] cache/sim beyond next\n", numres);
219 p = nextresult; /* reset */
220 } else if (q && (!nextresult || (q < nextresult))) {
221 int iscache;
222 p = q;
223 p+= strlen(G_BEGIN_CACHESIM);
224 //printf(DBG"[%ld] found token 6\n", numres);
225 item = p;
226 p = strstr(p, G_END_CACHESIM);
227 if (!p) break;
228 p+= strlen(G_END_CACHESIM);
229 //printf(DBG"[%ld] found token 7\n", numres);
230 itemlen = p - item - strlen(G_END_CACHESIM);
231 itemlen = MIN(GR_MAX_URL-1, itemlen);
232 if (!strncmp(p, "Cached", 6)) {
233 strncpy(nres->cache_url, item, itemlen);
234 nres->cache_url[itemlen] = '\0';
235 } else if (!strncmp(p, "Similar", 7)) {
236 strncpy(nres->similar_url, item, itemlen);
237 nres->similar_url[itemlen] = '\0';
239 // else
240 // break;
244 numres++;
245 if (!prev)
246 res = nres;
247 else
248 prev->next = nres;
249 prev = nres;
250 nres = NULL;
251 } while (!done || numres < maxres);
252 *results = res;
253 return numres;
254 err0:
255 free(nres);
256 while (res) {
257 nres = res->next;
258 free(res);
259 res = nres;
261 return err;
#ifdef TESTME
/*
 * Standalone test driver: reads a result page from standard input, parses it
 * with google_parse_results() and prints every result found.
 *
 * The input buffer is bracketed by two 32-bit guard words, and the `res`
 * pointer is declared between two tag pointers; all four values are printed
 * at the end next to their expected values so that a stray write by the
 * parser can be spotted.
 *
 * Returns 0 on success, 1 when the buffer cannot be allocated, reading fails,
 * or the parser reports a negative value.
 */
int main(int argc, char **argv)
{
	const uint32_t headguard = 0xa5a5a5a5;
	const uint32_t tailguard = 0x5a5a5a5a;
	struct google_result *results = NULL;
	struct google_result *tag1 = (struct google_result *)(uintptr_t)0xaaaa5555;
	struct google_result *res = NULL;
	struct google_result *tag2 = (struct google_result *)(uintptr_t)0x5555aaaa;
	uint32_t head, tail;
	size_t len = 0;
	char *p;
	int err;
	long nextid = 0;

	(void)argc;
	(void)argv;

	/* layout: 4 guard bytes | BUFSZ data bytes | 4 guard bytes */
	p = malloc(BUFSZ+8);
	if (!p)
		return 1;
	memcpy(p, &headguard, sizeof(headguard));
	memcpy(p + 4 + BUFSZ, &tailguard, sizeof(tailguard));

	/* read until EOF or until only the byte reserved for the NUL is left */
	while (len < BUFSZ - 1) {
		ssize_t got = read(0, p + 4 + len, BUFSZ - 1 - len);
		if (got < 0) {
			free(p);
			return 1;
		}
		if (got == 0)
			break;
		len += (size_t)got;
	}
	/* terminate right after the data so the parser never scans unread bytes */
	p[4 + len] = '\0';

	err = google_parse_results(p+4, len, &nextid, &results);
	printf("error 0x%08lx\n", (unsigned long)(unsigned int)err);
	if (err < 0) {
		free(p);
		return 1;
	}
	res = results;
	while (res) {
		printf("[%ld]:\nURL='%s'\nNAME='%s'\nSNIPSET='%s'\nCACHE='%s'\nSIMILAR='%s'\n\n", (long)res->id, res->url, res->name, res->snipset, res->cache_url, res->similar_url);
		res = res->next;
	}

	/* expected:actual pairs for the buffer guards, then for the stack tags */
	memcpy(&head, p, sizeof(head));
	memcpy(&tail, p + 4 + BUFSZ, sizeof(tail));
	printf("before = 0x%08lx:0x%08lx, after = 0x%08lx:0x%08lx\n", (unsigned long)headguard, (unsigned long)head, (unsigned long)tailguard, (unsigned long)tail);
	printf("before = 0x%08lx:0x%08lx, after = 0x%08lx:0x%08lx\n", 0xaaaa5555UL, (unsigned long)(uintptr_t)tag1, 0x5555aaaaUL, (unsigned long)(uintptr_t)tag2);

	while (results) {
		res = results->next;
		free(results);
		results = res;
	}
	free(p);
	return 0;
}
#endif