2 * Copyright 2004-2008, François Revol, <revol@free.fr>.
3 * Distributed under the terms of the MIT License.
13 #include "google_request.h"
14 #include "string_utils.h"
/* Redirect printf to dprintf for the code below.
 * NOTE(review): any conditional guarding this define is not visible here. */
19 #define printf dprintf
/* Prefix pasted in front of every debug message in this file. */
23 #define DBG "googlefs: parse_html: "
/* 128 KiB; used by main() as the read size and as the offset of the
 * trailing guard word. */
26 #define BUFSZ (128*1024)
/* Print the current debug step and post-increment dbgstep
 * (dbgstep is declared elsewhere, not visible here). */
28 #define PRST printf(DBG "step %d\n", dbgstep++)
/*
 * Text markers that google_parse_results() searches for with strstr().
 * The commented-out G_BEGIN_URL variants are earlier values kept next to
 * the active one.
 */
34 //#define G_BEGIN_URL "<p class=g><a class=l href=\""
35 //#define G_BEGIN_URL "<div class=g><a class=l href=\""
36 //#define G_BEGIN_URL "<div class=g><a href=\""
/* Start of a result entry: the href value follows immediately. */
37 #define G_BEGIN_URL "<h3 class=\"r\"><a href=\""
38 //#define G_END_URL "\">"
/* End of the href value; the link text (result name) follows. */
39 #define G_END_URL "\">"
40 //#define G_BEGIN_NAME
/* End of the link text. */
41 #define G_END_NAME "</a>"
/* Delimiters of the snippet text ("snipset" is the spelling used for the
 * struct field and macros throughout). */
42 #define G_BEGIN_SNIPSET /*"<td class=j>"*/"<font size=-1>"
43 #define G_END_SNIPSET "<br>"
/* Delimiters of the href of the "Cached" / "Similar" links. */
44 #define G_BEGIN_CACHESIM " <a class=fl href=\""
45 #define G_END_CACHESIM "\">"
/* Copied in front of hrefs that start with "/url?". */
46 #define G_URL_PREFIX "http://www.google.com"
/*
 * google_parse_results - scan a Google results page for result entries.
 *
 * Only part of this function is visible in this view; the notes below
 * describe what the visible lines do.
 *
 *  html     - start of the page text. It is searched with strstr(), so it
 *             is treated as a NUL-terminated string.
 *  htmlsize - not referenced by any line visible here.
 *  nextid   - id counter: each allocated result takes *nextid, and the
 *             counter is post-incremented.
 *  results  - checked for NULL together with html; how the parsed entries
 *             are handed back through it is not visible here.
 *
 * The visible sanity checks return EINVAL when an expected marker is
 * missing. For each entry the name, snipset, cache_url and similar_url
 * fields are filled with text cut out between the G_BEGIN_xxx and G_END_xxx
 * markers; the URL is written through urlp (presumably pointing into the
 * result's url field; its assignment is not visible here).
 */
48 int google_parse_results(const char *html
, size_t htmlsize
, long *nextid
, struct google_result
**results
)
50 struct google_result
*res
= NULL
, *nres
= NULL
, *prev
= NULL
;
/* Start of the following result entry, if any; used to keep the snippet and
 * cache/similar searches inside the current entry. */
52 char *nextresult
= NULL
;
/* Both pointers are required (the action taken on failure is not visible
 * here). */
59 if (!html
|| !results
)
62 printf(DBG
"sanity check...\n");
/* strstr(html, X) != html means X is not at the very start of the page.
 * The nested tests fall through to the innermost branch only when none of
 * the three known prefixes starts the page. */
64 /* google now sends <!doctype html><head> sometimes... */
65 if (strstr(html
, "<!doctype html><head>") != html
) {
66 if (strstr(html
, "<html><head>") != html
) {
67 if (strstr(html
, "<!doctype html><html ") != html
)
/* The page must mention "Google", contain a "<body" tag and the
 * "Search Results<" text; otherwise EINVAL is returned. */
72 // p = strstr(html, "<title>Google Search:");
73 p
= strstr(html
, "Google");
74 if (!p
) return EINVAL
;
76 p
= strstr(html
, "<body");
77 if (!p
) return EINVAL
;
/* p is left at "Search Results<"; the entry scan below starts from it. */
81 p = strstr(html, "Search Results<");
82 if (!p) return EINVAL;
87 printf(DBG
"parsing...\n");
/* Allocate and zero a fresh result record, then give it the next id.
 * (The malloc failure check is not visible here.) */
97 nres
= malloc(sizeof(struct google_result
));
102 memset(nres
, 0, sizeof(struct google_result
));
103 nres
->id
= (*nextid
)++; //- 1;
/* --- URL: text between G_BEGIN_URL and G_END_URL --- */
107 // <p class=g><a href=URL>
112 p
= strstr(p
, G_BEGIN_URL
);
115 p
+= strlen(G_BEGIN_URL
);
/* Remember where the next entry begins before moving on. */
116 nextresult
= strstr(p
, G_BEGIN_URL
);
117 //printf(DBG"[%ld] found token 1\n", numres);
119 p
= strstr(p
, G_END_URL
);
122 p
+= strlen(G_END_URL
);
123 //printf(DBG"[%ld] found token 2\n", numres);
/* itemlen starts as the capacity left for the URL (minus the NUL). */
124 itemlen
= GR_MAX_URL
-1;
/* hrefs starting with "/url?" get G_URL_PREFIX copied in first; the
 * remaining capacity and the write position are adjusted to match. */
126 if (!strncmp(item
, "/url?", 5)) {
127 strcpy(urlp
, G_URL_PREFIX
);
128 itemlen
-= strlen(G_URL_PREFIX
);
129 urlp
+= strlen(G_URL_PREFIX
);
/* p is already past G_END_URL, hence the subtraction of its length.
 * Copy at most the remaining capacity and terminate explicitly, since
 * strncpy() does not guarantee a terminator. */
132 itemlen
= MIN(itemlen
, p
- item
- strlen(G_END_URL
));
133 strncpy(urlp
, item
, itemlen
);
134 urlp
[itemlen
] = '\0';
/* --- Name: link text up to G_END_NAME --- */
137 //<b>Google</b> Web APIs - FAQ</a><table
139 p
= strstr(p
, G_END_NAME
);
142 p
+= strlen(G_END_NAME
);
143 //printf(DBG"[%ld] found token 3\n", numres);
144 itemlen
= p
- item
- strlen(G_END_NAME
);
/* The raw title is copied into a temporary buffer of up to GR_MAX_NAME*4-1
 * bytes, presumably because tags and entities are removed before the final
 * copy into the GR_MAX_NAME-sized field. */
145 //itemlen = MIN(GR_MAX_NAME-1, itemlen);
146 itemlen
= MIN(GR_MAX_NAME
*4-1, itemlen
);
147 q
= malloc(itemlen
+1);
150 strncpy(q
, item
, itemlen
);
/* Remove <b>, </b>, <em> and </em> tags by shifting the tail of the string
 * left over each tag.
 * NOTE(review): strcpy() with overlapping source and destination is
 * undefined behaviour; memmove(tmp, tmp + N, strlen(tmp + N) + 1) is the
 * well-defined equivalent. */
154 while ((tmp
= strstr(q
, "<b>")))
155 strcpy(tmp
, tmp
+ 3);
156 while ((tmp
= strstr(q
, "</b>")))
157 strcpy(tmp
, tmp
+ 4);
158 /* strip <*em> off */
160 while ((tmp
= strstr(q
, "<em>")))
161 strcpy(tmp
, tmp
+ 4);
162 while ((tmp
= strstr(q
, "</em>")))
163 strcpy(tmp
, tmp
+ 5);
/* unentitify_string() comes from string_utils.h; presumably it decodes HTML
 * entities and returns a new string (ownership not visible here). */
165 tmp
= unentitify_string(q
);
/* Bounded copy into the name field, always terminated. */
169 strncpy(nres
->name
, tmp
, GR_MAX_NAME
-1);
170 nres
->name
[GR_MAX_NAME
-1] = '\0';
/* --- Snippet: text between G_BEGIN_SNIPSET and G_END_SNIPSET --- */
176 //<td class=j><font size=-1><b>...</b> a custom Java client library, documentation on <b>how</b> <b>to</b> use the <b>...</b> You can find it at http://<b>api</b>.<b>google</b>.com/GoogleSearch.wsdl <b>...</b> need to get started is in <b>googleapi</b>.jar <b>...</b> <br>
178 q
= strstr(p
, G_BEGIN_SNIPSET
);
/* Only use the snippet marker when it belongs to this entry, i.e. it lies
 * before the next entry or there is no next entry. */
179 if (q
&& (!nextresult
|| (q
< nextresult
))) {
181 p
+= strlen(G_BEGIN_SNIPSET
);
182 //printf(DBG"[%ld] found token 4\n", numres);
184 p
= strstr(p
, G_END_SNIPSET
);
186 p
+= strlen(G_END_SNIPSET
);
187 //printf(DBG"[%ld] found token 5\n", numres);
188 itemlen
= p
- item
- strlen(G_END_SNIPSET
);
/* NOTE(review): this first copy into snipset is bounded by GR_MAX_URL-1,
 * not GR_MAX_SNIPSET-1; confirm the snipset field is at least GR_MAX_URL
 * bytes. */
189 itemlen
= MIN(GR_MAX_URL
-1, itemlen
);
190 strncpy(nres
->snipset
, item
, itemlen
);
191 nres
->snipset
[itemlen
] = '\0';
/* Run the snippet through unentitify_string() and copy the result back,
 * bounded by the field size. */
193 tmp
= unentitify_string(nres
->snipset
);
196 strncpy(nres
->snipset
, tmp
, GR_MAX_SNIPSET
-1);
197 nres
->snipset
[GR_MAX_SNIPSET
-1] = '\0';
/* Strip <b>, </b> and carriage returns in place; the body of the "\n" loop
 * is not visible here.
 * NOTE(review): same overlapping strcpy() concern as for the name above. */
200 while ((tmp
= strstr(nres
->snipset
, "<b>")))
201 strcpy(tmp
, tmp
+ 3);
202 while ((tmp
= strstr(nres
->snipset
, "</b>")))
203 strcpy(tmp
, tmp
+ 4);
204 while ((tmp
= strstr(nres
->snipset
, "\r")))
205 strcpy(tmp
, tmp
+ 1);
206 while ((tmp
= strstr(nres
->snipset
, "\n")))
/* --- Cached / Similar links: at most two per entry (i = 0, 1) --- */
211 /* find cache/similar url */
212 // <a class=fl href="http://216.239.59.104/search?q=cache:vR7BaPWutnkJ:www.google.com/apis/api_faq.html+google+api++help+%22frequently+asked%22+-plop&hl=en&lr=lang_en&ie=UTF-8">Cached</a>
213 for (i
= 0; i
< 2; i
++) {
215 q
= strstr(p
, G_BEGIN_CACHESIM
);
/* A marker located after the start of the next entry belongs to that entry:
 * log it and rewind p to the next entry instead of consuming it. */
216 if (q
&& nextresult
&& (q
> nextresult
)) {
218 printf(DBG
"[%ld] cache/sim beyond next\n", numres
);
219 p
= nextresult
; /* reset */
220 } else if (q
&& (!nextresult
|| (q
< nextresult
))) {
223 p
+= strlen(G_BEGIN_CACHESIM
);
224 //printf(DBG"[%ld] found token 6\n", numres);
226 p
= strstr(p
, G_END_CACHESIM
);
228 p
+= strlen(G_END_CACHESIM
);
229 //printf(DBG"[%ld] found token 7\n", numres);
230 itemlen
= p
- item
- strlen(G_END_CACHESIM
);
231 itemlen
= MIN(GR_MAX_URL
-1, itemlen
);
/* p now points at the link text; it decides which field receives the href
 * that was just delimited. Other link texts are ignored. */
232 if (!strncmp(p
, "Cached", 6)) {
233 strncpy(nres
->cache_url
, item
, itemlen
);
234 nres
->cache_url
[itemlen
] = '\0';
235 } else if (!strncmp(p
, "Similar", 7)) {
236 strncpy(nres
->similar_url
, item
, itemlen
);
237 nres
->similar_url
[itemlen
] = '\0';
/* Repeat while done is unset or fewer than maxres results were counted.
 * NOTE(review): with ||, setting done alone does not end the loop while
 * numres < maxres; confirm that && was not intended. */
251 } while (!done
|| numres
< maxres
);
/*
 * Test driver: reads a results page from file descriptor 0, runs
 * google_parse_results() on it and prints every parsed entry.
 *
 * Only part of this function is visible in this view (the declarations of
 * p, len, err and nextid, the buffer allocation and the loop over the
 * results are not shown). argc and argv are not referenced by any visible
 * line.
 */
265 int main(int argc
, char **argv
)
267 struct google_result
*results
;
/* tag1 and tag2 are sentinel values declared on either side of res; they are
 * printed at the end next to their expected patterns, presumably to reveal
 * stray writes around res.
 * NOTE(review): integer constants are used as pointer initialisers without
 * a cast. */
268 struct google_result
*tag1
= 0xaaaa5555, *res
= NULL
, *tag2
= 0x5555aaaa;
/* The page is read 4 bytes into the buffer, leaving room for a guard word
 * in front of it.
 * NOTE(review): no visible line checks the read() result or NUL-terminates
 * the data, while the parser relies on strstr(). */
275 len
= read(0, p
+4, BUFSZ
);
/* Guard words directly before and BUFSZ bytes after the start of the data
 * area. */
277 *(uint32
*)p
= 0xa5a5a5a5;
278 *(uint32
*)(&p
[BUFSZ
+4]) = 0x5a5a5a5a;
279 err
= google_parse_results(p
+4, len
, &nextid
, &results
);
/* NOTE(review): %08lx expects an unsigned long; confirm the type of err. */
280 printf("error 0x%08lx\n", err
);
/* Dump one parsed entry (the enclosing loop over the results is not visible
 * here). */
285 printf("[%ld]:\nURL='%s'\nNAME='%s'\nSNIPSET='%s'\nCACHE='%s'\nSIMILAR='%s'\n\n", res
->id
, res
->url
, res
->name
, res
->snipset
, res
->cache_url
, res
->similar_url
);
/* Print expected:actual pairs for the buffer guard words, then for the
 * tag1/tag2 sentinels, so an overwrite shows up as a mismatch.
 * NOTE(review): the arguments are int-sized literals, uint32 values and
 * pointers, all printed with %08lx; confirm this matches on the target. */
288 printf("before = 0x%08lx:0x%08lx, after = 0x%08lx:0x%08lx\n", 0xa5a5a5a5, *(uint32
*)p
, 0x5a5a5a5a, *(uint32
*)(&p
[BUFSZ
+4]));
289 printf("before = 0x%08lx:0x%08lx, after = 0x%08lx:0x%08lx\n", 0xaaaa5555, tag1
, 0x5555aaaa, tag2
);