Add mime-types for similar-page regexs to upstream-server proxy config
[httpd-crcsyncproxy.git] / crccache / mod_crccache_client_find_similar.c
blob6e08fc2b3b740c4e8568ccd821561876995b5685
1 /* Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
16 * Find a page for a similar URL as the newly requested page
17 * Created on: 02/08/2010
18 * Author: Alex Wulms
21 #include <apr.h>
23 #include <apr_strings.h>
24 #include <apr_lib.h>
26 #if APR_HAVE_UNISTD_H
27 /* for getpid() */
28 #include <unistd.h>
29 #endif
32 #include <httpd.h>
33 #include <http_log.h>
36 #ifdef AP_NEED_SET_MUTEX_PERMS
37 #include "unixd.h"
38 #endif
40 #include "crccache.h"
41 #include "mod_crccache_client_find_similar.h"
42 #include "ap_log_helper.h"
43 #include "rmm_hash.h"
45 RMM_OFF_T_DECLARE(char);
47 typedef struct vary_headers_s vary_headers_t;
48 RMM_OFF_T_DECLARE(vary_headers_t);
49 struct vary_headers_s {
50 RMM_OFF_T(vary_headers_t) next;
51 RMM_OFF_T(char) name;
52 RMM_OFF_T(char) value;
56 typedef struct cached_files_info_s cached_files_info_t;
57 RMM_OFF_T_DECLARE(cached_files_info_t);
58 struct cached_files_info_s {
59 RMM_OFF_T(cached_files_info_t) prev;
60 RMM_OFF_T(cached_files_info_t) next;
61 RMM_OFF_T(char) basepath; // Path without .header or .data postfix
62 RMM_OFF_T(char) uri; // URI of the page (useful for logging purposes)
63 RMM_OFF_T(vary_headers_t) vary_headers;
66 typedef struct sp_per_content_type_s sp_per_content_type_t;
67 RMM_OFF_T_DECLARE(sp_per_content_type_t);
68 struct sp_per_content_type_s {
69 RMM_OFF_T(sp_per_content_type_t) next;
70 RMM_OFF_T(char) content_type;
71 RMM_OFF_T(cached_files_info_t) cached_files_info;
72 RMM_OFF_T(rmm_hash_t) cached_files_info_by_path;
73 RMM_OFF_T(cached_files_info_t) tail_file_info;
76 typedef struct sp_per_regex_s sp_per_regex_t;
77 RMM_OFF_T_DECLARE(sp_per_regex_t);
78 struct sp_per_regex_s {
79 RMM_OFF_T(sp_per_regex_t) next;
80 /* The regex parameter stored here is the non-compiled regex string.
81 * The compiled version must be cached in a per-process cache pool.
82 * Reason is that the ap_regex compiler allocates an internal structure
83 * for the compiled data using malloc. The ap_preg structure does not provide
84 * any info about that internal structure (like the length) and as such,
85 * the internal structure can not be transferred to the shared memory :-(
87 RMM_OFF_T(char) regex;
88 apr_size_t regex_len;
89 RMM_OFF_T(sp_per_content_type_t) similar_pages_per_content_type;
92 RMM_OFF_T_DECLARE(int);
93 struct similar_page_cache_s {
94 const char* cache_root;
95 apr_size_t cache_root_len;
97 apr_global_mutex_t *fs_cache_lock;
98 apr_size_t cache_bytes; /* Size (in bytes) of shared memory cache */
99 #if APR_HAS_SHARED_MEMORY
100 apr_shm_t *shm;
101 #endif
102 apr_rmm_t *rmm;
103 RMM_OFF_T(rmm_hash_t) similar_pages_per_host;
104 const char *cache_file; /* filename for shm backing cache file */
105 const char *lock_file; /* filename for shm lock mutex */
106 RMM_OFF_T(int) lock_is_available; /* lock is available in all threads/subprocesses */
107 apr_hash_t *similar_pages_regexs; /* compiled regular expressions for similar pages */
108 RMM_OFF_T(rmm_hash_t) vary_headers_cache;
109 int similar_pages_cache_initialized;
113 * Returns 1 when the lock is available in all threads/subprocesses and 0 otherwise
115 static int is_lock_available(similar_page_cache_t *sp_cache)
117 return *APR_RMM_ADDR_GET(int, sp_cache->rmm, sp_cache->lock_is_available);
121 * Duplicate a string value into the a memory segment allocated from the relocatable memory.
122 * Returns: RMM_OFF_NULL on memory allocation error
123 * offset of duplicated string when all fine
125 static RMM_OFF_T(char) rmm_strdup(apr_rmm_t *rmm, const char *value)
127 size_t valuelen = strlen(value);
128 RMM_OFF_T(char) rslt = apr_rmm_malloc(rmm, valuelen+1);
129 if (rslt == RMM_OFF_NULL)
131 return RMM_OFF_NULL;
133 memcpy(APR_RMM_ADDR_GET(char, rmm, rslt), value, valuelen+1);
134 return rslt;
137 static apr_status_t similar_page_cache_kill(void *data)
139 similar_page_cache_t *sp_cache = data;
141 sp_cache->similar_pages_cache_initialized = 0;
142 if (sp_cache->rmm != NULL)
144 apr_rmm_destroy(sp_cache->rmm);
145 sp_cache->rmm = NULL;
147 #if APR_HAS_SHARED_MEMORY
148 if (sp_cache->shm != NULL) {
149 apr_status_t result = apr_shm_destroy(sp_cache->shm);
150 sp_cache->shm = NULL;
151 return result;
153 #endif
154 return APR_SUCCESS;
157 typedef struct {
158 int compiled;
159 ap_regex_t *preg;
160 } compiled_regex_info_t;
162 static int fsp_regex_match(request_rec *r, const char *regex, const char *uri_key, similar_page_cache_t *sp_cache)
164 if (sp_cache->similar_pages_regexs == NULL) {
165 sp_cache->similar_pages_regexs = apr_hash_make(r->server->process->pool);
166 if (sp_cache->similar_pages_regexs == NULL)
168 // Not enough memory to cache the regexs, so probably also not enough memory to
169 // compile the regex.
170 return 0; // Return a mismatch
173 compiled_regex_info_t *regex_info = (compiled_regex_info_t *)apr_hash_get(sp_cache->similar_pages_regexs, regex, APR_HASH_KEY_STRING);
174 if (regex_info == NULL)
176 regex_info = apr_palloc(r->server->process->pool, sizeof(compiled_regex_info_t));
177 if (regex_info == NULL)
179 ap_log_error(APLOG_MARK, APLOG_WARNING, APR_SUCCESS, r->server, "Could not allocate memory for regex_info");
180 return 0; // Return a mismatch
182 regex_info->preg = apr_palloc(r->server->process->pool, sizeof(ap_regex_t));
183 if (regex_info->preg == NULL)
185 ap_log_error(APLOG_MARK, APLOG_WARNING, APR_SUCCESS, r->server, "Could not allocate memory for regex_info->preg");
186 return 0; // Return a mismatch
188 int rslt = ap_regcomp(regex_info->preg, regex, 0);
189 if (rslt != 0)
191 ap_log_error(APLOG_MARK, APLOG_WARNING, APR_SUCCESS, r->server, "Could not compile regexp %s, return code: %d", regex, rslt);
192 regex_info->compiled = 0;
194 else
196 regex_info->compiled = 1;
198 // Store the 'compiled' regex even when the compilation failed. This prevents the same warning from re-appearing. Otherwise, the
199 // compilation will fail on each request for a page that might match this regex.
200 apr_hash_set(sp_cache->similar_pages_regexs, regex, APR_HASH_KEY_STRING, regex_info);
202 if (regex_info->compiled)
204 return ap_regexec(regex_info->preg, uri_key, 0, NULL, AP_REG_ICASE) == 0;
206 return 0; // Compilation of regex has failed at least once. Return a mismatch
/*****************************************************************
 * One parsed element of an Accept... header as sent by the client
 * (the same record is used for encodings and languages).
 *
 * - Taken from mod_negotation.c
 */
typedef struct accept_rec {
    char *name;    /* MUST be lowercase */
    float quality; /* Value of the 'q' (or 'qs') parameter */
    float level;   /* Value of the 'level' parameter */
    char *charset; /* for content-type only */
} accept_rec;
222 /*****************************************************************
223 * parse quality value. atof(3) is not well-usable here, because it
224 * depends on the locale (argh).
226 * However, RFC 2616 states:
227 * 3.9 Quality Values
229 * [...] HTTP/1.1 applications MUST NOT generate more than three digits
230 * after the decimal point. User configuration of these values SHOULD also
231 * be limited in this fashion.
233 * qvalue = ( "0" [ "." 0*3DIGIT ] )
234 * | ( "1" [ "." 0*3("0") ] )
236 * This is quite easy. If the supplied string doesn't match the above
237 * definition (loosely), we simply return 1 (same as if there's no qvalue)
239 * - Taken from mod_negotation.c
241 static float atoq(const char *string)
243 if (!string || !*string) {
244 return 1.0f;
247 while (*string && apr_isspace(*string)) {
248 ++string;
251 /* be tolerant and accept qvalues without leading zero
252 * (also for backwards compat, where atof() was in use)
254 if (*string != '.' && *string++ != '0') {
255 return 1.0f;
258 if (*string == '.') {
259 /* better only one division later, than dealing with fscking
260 * IEEE format 0.1 factors ...
262 int i = 0;
264 if (*++string >= '0' && *string <= '9') {
265 i += (*string - '0') * 100;
267 if (*++string >= '0' && *string <= '9') {
268 i += (*string - '0') * 10;
270 if (*++string > '0' && *string <= '9') {
271 i += (*string - '0');
276 return (float)i / 1000.0f;
279 return 0.0f;
282 /*****************************************************************
283 * Get a single mime type entry --- one media type and parameters;
284 * enter the values we recognize into the argument accept_rec
286 * - Taken from mod_negotation.c
288 static const char *get_accept_entry(apr_pool_t *p, accept_rec *result,
289 const char *accept_line)
291 result->quality = 1.0f;
292 result->level = 0.0f;
293 result->charset = "";
296 * Note that this handles what I gather is the "old format",
298 * Accept: text/html text/plain moo/zot
300 * without any compatibility kludges --- if the token after the
301 * MIME type begins with a semicolon, we know we're looking at parms,
302 * otherwise, we know we aren't. (So why all the pissing and moaning
303 * in the CERN server code? I must be missing something).
306 result->name = ap_get_token(p, &accept_line, 0);
307 ap_str_tolower(result->name); /* You want case insensitive,
308 * you'll *get* case insensitive.
311 /* KLUDGE!!! Default HTML to level 2.0 unless the browser
312 * *explicitly* says something else.
315 if (!strcmp(result->name, "text/html") && (result->level == 0.0)) {
316 result->level = 2.0f;
318 else if (!strcmp(result->name, INCLUDES_MAGIC_TYPE)) {
319 result->level = 2.0f;
321 else if (!strcmp(result->name, INCLUDES_MAGIC_TYPE3)) {
322 result->level = 3.0f;
325 while (*accept_line == ';') {
326 /* Parameters ... */
328 char *parm;
329 char *cp;
330 char *end;
332 ++accept_line;
333 parm = ap_get_token(p, &accept_line, 1);
335 /* Look for 'var = value' --- and make sure the var is in lcase. */
337 for (cp = parm; (*cp && !apr_isspace(*cp) && *cp != '='); ++cp) {
338 *cp = apr_tolower(*cp);
341 if (!*cp) {
342 continue; /* No '='; just ignore it. */
345 *cp++ = '\0'; /* Delimit var */
346 while (*cp && (apr_isspace(*cp) || *cp == '=')) {
347 ++cp;
350 if (*cp == '"') {
351 ++cp;
352 for (end = cp;
353 (*end && *end != '\n' && *end != '\r' && *end != '\"');
354 end++);
356 else {
357 for (end = cp; (*end && !apr_isspace(*end)); end++);
359 if (*end) {
360 *end = '\0'; /* strip ending quote or return */
362 ap_str_tolower(cp);
364 if (parm[0] == 'q'
365 && (parm[1] == '\0' || (parm[1] == 's' && parm[2] == '\0'))) {
366 result->quality = atoq(cp);
368 else if (parm[0] == 'l' && !strcmp(&parm[1], "evel")) {
369 result->level = (float)atoi(cp);
371 else if (!strcmp(parm, "charset")) {
372 result->charset = cp;
376 if (*accept_line == ',') {
377 ++accept_line;
380 return accept_line;
384 /*****************************************************************
385 * Dealing with Accept... header lines ...
386 * Accept, Accept-Charset, Accept-Language and Accept-Encoding
387 * are handled by do_header_line() - they all have the same
388 * basic structure of a list of items of the format
389 * name; q=N; charset=TEXT
391 * where charset is only valid in Accept.
393 * - Taken from mod_negotation.c
395 static apr_array_header_t *parse_accept_line(apr_pool_t *p,
396 const char *accept_line)
398 apr_array_header_t *accept_recs;
400 if (!accept_line) {
401 return NULL;
404 accept_recs = apr_array_make(p, 40, sizeof(accept_rec));
405 if (accept_recs == NULL)
407 return NULL; // Nothing to allocate
409 while (*accept_line) {
410 accept_rec *new = (accept_rec *) apr_array_push(accept_recs);
411 accept_line = get_accept_entry(p, new, accept_line);
412 if (!strcmp(new->name, "*/*"))
414 apr_array_pop(accept_recs); // Discard this entry
418 return accept_recs;
/**
 * Compare a mime type with one type of an accept header.
 * Returns non-zero when both strings are identical or when the accept type
 * has a '*' at the first position where it differs from the mime type
 * (so "text" "/" "*" matches "text/html"), and 0 otherwise.
 */
static int match_accept_type_vs_mime_type(const char *mime_type, const char *accept_type)
{
    size_t i = 0;

    /* Skip the common prefix of both strings */
    while (mime_type[i] != '\0' && accept_type[i] != '\0' && mime_type[i] == accept_type[i]) {
        i++;
    }

    if (accept_type[i] == '*') {
        return 1; /* Wildcard in the accept type covers the rest of the mime type */
    }
    return mime_type[i] == '\0' && accept_type[i] == '\0';
}
433 // TODO: Refine. Current logic is simplistic. It only checks the mime-type part of the content-type
434 // header of the cached page (e.g. it ignores the charset) and furthermore, it ignores
435 // the 'quality'/'level' indicates in the accept header. The function returns true
436 // if the mime-type of the cached page matches at least one of the content-types indicated
437 // in the accept header
438 // Note that the foundation for more fine-grained logic has been laid. The accept-header
439 // is parsed and broken down in all the constituting elements, using code copied from
440 // module mod-negotation
441 static int fsp_accept_matches_content_type(similar_page_cache_t *sp_cache,
442 request_rec *r, RMM_OFF_T(char) content_type)
444 apr_array_header_t *accepts = parse_accept_line(r->pool, apr_table_get(r->headers_in, ACCEPT_HEADER));
445 const char *content_type_line = APR_RMM_ADDR_GET(char, sp_cache->rmm, content_type);
447 if (accepts == NULL)
449 return 0; // Can't validate content type versus accept header
451 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, r->server,
452 "Comparing content type line %s versus accept line %s",
453 content_type_line, apr_table_get(r->headers_in, ACCEPT_HEADER));
455 // Only look at the mime-type (e.g. text/html) of the content-type line.
456 // Discard any other parameters like the charset
457 char *mime_type = ap_get_token(r->pool, &content_type_line, 0);
458 ap_str_tolower(mime_type);
460 accept_rec *accept_elts = (accept_rec *)accepts->elts;
461 int cnt;
462 for (cnt = 0; cnt != accepts->nelts; cnt++)
464 const char *accept_type = accept_elts[cnt].name;
465 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, r->server,
466 "Comparing mime type %s versus accept type %s", mime_type, accept_type);
467 if (match_accept_type_vs_mime_type(mime_type, accept_type))
469 return 1; // A good-enough match found. Use this page.
472 return 0; // No match found.Skip this page
475 static void clear_rmm_field(apr_rmm_t *rmm, apr_rmm_off_t *offset_ptr)
477 if (*offset_ptr != RMM_OFF_NULL) {
478 apr_rmm_free(rmm, *offset_ptr);
479 *offset_ptr = RMM_OFF_NULL;
484 * Free all memory used by a cached_files_info_t structure
485 * Be aware that this function might get called while the structure is not yet complete. E.g.
486 * it gets called when an out-of-memory condition occurs during the construction
488 static void free_cached_files_info(apr_rmm_t *rmm, sp_per_content_type_t *sp_per_ct_physical, RMM_OFF_T(cached_files_info_t) cached_file_info)
490 cached_files_info_t *cfi_physical = APR_RMM_ADDR_GET(cached_files_info_t, rmm, cached_file_info);
492 // Delete the entry from the hash table
493 if (sp_per_ct_physical->cached_files_info_by_path != RMM_OFF_NULL && cfi_physical->basepath != RMM_OFF_NULL) {
494 rmm_hash_set(rmm, sp_per_ct_physical->cached_files_info_by_path, cfi_physical->basepath, APR_HASH_KEY_STRING, RMM_OFF_NULL);
497 // Update the tail entry if this was the tail entry
498 if (cached_file_info == sp_per_ct_physical->tail_file_info) {
499 sp_per_ct_physical->tail_file_info = cfi_physical->prev;
502 // Remove the entry from the (double-linked) list
503 if (cfi_physical->next != RMM_OFF_NULL) {
504 APR_RMM_ADDR_GET(cached_files_info_t, rmm, cfi_physical->next)->prev = cfi_physical->prev;
506 if (cfi_physical->prev != RMM_OFF_NULL) {
507 APR_RMM_ADDR_GET(cached_files_info_t, rmm, cfi_physical->prev)->next = cfi_physical->next;
509 else {
510 sp_per_ct_physical->cached_files_info = cfi_physical->next;
513 clear_rmm_field(rmm, &cfi_physical->basepath);
514 clear_rmm_field(rmm, &cfi_physical->uri);
515 apr_rmm_free(rmm, cached_file_info);
519 * Verify if the cached file contains a vary header. If yes, then match the headers in the request with
520 * the corresponding headers in the cached page.
521 * Returns true if there is no vary header or if the vary headers match correctly
522 * TODO: refine the logic to match the header values. According to the RFC, the comparison may
523 * ignore white-space characters in the header values (accordingly to the BNF/syntax of that specific header...).
524 * At the moment, the header values are compared literally, so in theory, this comparison is too restrictive.
526 static int match_vary_headers(similar_page_cache_t *sp_cache, request_rec *r, RMM_OFF_T(vary_headers_t)vary_headers)
528 if (vary_headers == RMM_OFF_NULL) {
529 return 1; // The cached page did not specify vary header, so the new request matches by definition
531 apr_rmm_t *rmm = sp_cache->rmm;
532 while (vary_headers != RMM_OFF_NULL) {
533 vary_headers_t *vary_headers_physical = APR_RMM_ADDR_GET(vary_headers_t, rmm, vary_headers);
534 const char *headername = APR_RMM_ADDR_GET(char, rmm, vary_headers_physical->name);
535 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, r->server, "Comparing vary header %s", headername);
536 if (strcmp(headername, "*") == 0) {
537 // The special 'header name' * signifies that the server always varies stuff in an undisclosed manner.
538 // The similar page matching will probably yield bad results. Ignore this page.
539 return 0;
541 const char *cached_headervalue = (vary_headers_physical->value == RMM_OFF_NULL) ?
542 NULL : APR_RMM_ADDR_GET(char, rmm, vary_headers_physical->value);
543 const char *req_headervalue = apr_table_get(r->headers_in, headername);
544 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, r->server, "Cached value: %s, request value: %s",
545 cached_headervalue, req_headervalue);
546 if (req_headervalue == NULL && cached_headervalue != NULL) {
547 return 0; // Expecting a value but did not get one
549 if (req_headervalue != NULL && cached_headervalue == NULL) {
550 return 0; // Expecting empty header but got a value
552 if (req_headervalue != NULL && strcmp(req_headervalue, cached_headervalue) != 0) {
553 return 0; // The new and old header value differ
555 vary_headers = vary_headers_physical->next;
557 return 1; // All vary headers are the same
561 * Try to open the file indicated in cfi_physical structure
562 * Returns APR_SUCCESS if the file was successfully opened, in which case the dobj structure
563 * will have been properly updated.
564 * Returns other error codes in case of problems.
565 * WARNING: When the file no longer exists, the structure cfi_physical will be deleted from memory and
566 * from the linked-list. It means that the caller should evaluate cfi_physical->next *before* invoking
567 * this function.
569 static apr_status_t open_cached_file(disk_cache_object_t *dobj, request_rec *r,
570 similar_page_cache_t *sp_cache, sp_per_content_type_t *sp_per_ct_physical,
571 RMM_OFF_T(cached_files_info_t) cached_file_info)
573 apr_rmm_t *rmm = sp_cache->rmm;
574 cached_files_info_t *cfi_physical = APR_RMM_ADDR_GET(cached_files_info_t, sp_cache->rmm, cached_file_info);
575 const char *fullpath = apr_pstrcat(r->pool, sp_cache->cache_root, "/",
576 APR_RMM_ADDR_GET(char, rmm, cfi_physical->basepath), CACHE_DATA_SUFFIX, NULL);
577 int flags = APR_READ|APR_BINARY;
578 #ifdef APR_SENDFILE_ENABLED
579 flags |= APR_SENDFILE_ENABLED;
580 #endif
581 apr_status_t rc = apr_file_open(&dobj->fd, fullpath, flags, 0, r->pool);
582 if (rc == APR_SUCCESS)
584 // Successfully opened the file. Try to obtain the file-size and return the completed dobj
585 // to the caller
586 apr_finfo_t finfo;
587 rc = apr_file_info_get(&finfo, APR_FINFO_SIZE, dobj->fd);
588 if (rc == APR_SUCCESS) {
589 dobj->file_size = finfo.size;
590 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, r->server,
591 "Basing CRCSYNC/delta-http for requested URL on cached page for URL %s of size %" APR_SIZE_T_FMT,
592 APR_RMM_ADDR_GET(char, sp_cache->rmm, cfi_physical->uri), dobj->file_size);
593 return APR_SUCCESS;
595 // Could not obtain file info for a mysterious reason. Skip this file.
596 apr_file_close(dobj->fd);
598 else
600 // Apparently the cached file is no longer there. Maybe it got cleaned by htcacheclean?
601 if (is_lock_available(sp_cache)) {
602 // Remove the entry. But only if this process could obtain the semaphore...
603 free_cached_files_info(rmm, sp_per_ct_physical, cached_file_info);
606 return rc; // Could not open file or obtain file-info for whatever reason.
610 * Critical section of the code to find similar pages. While this code is in progress, no updates to the data
611 * structures may happen by other threads/processes, like by function 'update_or_add_similar_page(...), which is invoked
612 * when a new file has been saved to the disk cache.
614 * Please note that this function itself can update the 'free-pages' list if the code discovers that the data
615 * structure is referencing a file that no longer exists. Apart from that update-block, the code is fully re-entrant.
616 * With other words: multiple requests can enter this code concurrently, as long as they don't update the 'free-pages'
617 * list and as long as it does not happen concurrently with the 'update_or_add_similar_page(...) function
619 * At the moment, the code block that updates the 'free-pages' list checks if a lock could be obtained. If no lock could
620 * be obtained, it does not update the list. It only updates the list if a lock could be obtained.
622 * The locking is currently rather coarse grained: when locks are available, the (global mutex) makes sure that the access
623 * to this function and to the 'update_or_add_similar_page(...) function is exclusive. On the other hand, when the
624 * global mutex could not be initialized and as such is not available, the 'update_or_add_similar_page(...) function
625 * is disabled and only the 'find-similar-page' function works, for data that got loaded during the server startup.
627 * In order to increase the scalability, a more fine-grained locking could be implemented by carefully assessing which
628 * parts of the 'update_or_add_similar_page(...) function conflict with data structures used by this 'find_similar_page'
629 * function and then adding the appropriate locks where required.
631 static apr_status_t find_similar_page_cs(disk_cache_object_t *dobj, request_rec *r, similar_page_cache_t *sp_cache, const char *host)
633 apr_rmm_t *rmm = sp_cache->rmm;
634 RMM_OFF_T(sp_per_regex_t) sp_per_regex = rmm_hash_get(rmm, sp_cache->similar_pages_per_host, host, APR_HASH_KEY_STRING);
635 while (sp_per_regex != RMM_OFF_NULL)
637 sp_per_regex_t *sp_per_regex_physical = APR_RMM_ADDR_GET(sp_per_regex_t, rmm, sp_per_regex);
638 if (fsp_regex_match(r, APR_RMM_ADDR_GET(char, rmm, sp_per_regex_physical->regex), r->unparsed_uri, sp_cache))
640 // Found the largest matching regex. Find a group of pages with an appropriate content type
641 RMM_OFF_T(sp_per_content_type_t) sp_per_ct = sp_per_regex_physical->similar_pages_per_content_type;
642 while (sp_per_ct != RMM_OFF_NULL)
644 sp_per_content_type_t *sp_per_ct_physical = APR_RMM_ADDR_GET(sp_per_content_type_t, rmm, sp_per_ct);
645 if (fsp_accept_matches_content_type(sp_cache, r, sp_per_ct_physical->content_type))
647 // Found list of pages with appropriate content type for the matching regex
648 // Now try to open a page associated with this regex and content type
649 RMM_OFF_T(cached_files_info_t) cached_file_info = sp_per_ct_physical->cached_files_info;
650 while (cached_file_info != RMM_OFF_NULL)
652 cached_files_info_t *cfi_physical = APR_RMM_ADDR_GET(cached_files_info_t, sp_cache->rmm, cached_file_info);
653 RMM_OFF_T(cached_files_info_t) next_cfi = cfi_physical->next;
654 if (match_vary_headers(sp_cache, r, cfi_physical->vary_headers)) {
655 if (open_cached_file(dobj, r, sp_cache, sp_per_ct_physical, cached_file_info) == APR_SUCCESS) {
656 return APR_SUCCESS; // File successfully opened. Done.
659 cached_file_info = next_cfi;
660 } // while (cached_file_info != RMM_OFF_NULL)
661 } // if (find_similar_page_accept_matches_content_type(sp_cache, r, sp_per_ct_physical->content_type))
662 sp_per_ct = sp_per_ct_physical->next;
663 } // while (sp_per_ct != RMM_OFF_NULL)
664 } // if (find_similar_page_regex_match(r, APR_RMM_ADDR_GET(char, rmm, sp_per_regex_physical->regex), r->unparsed_uri, sp_cache))
665 sp_per_regex = sp_per_regex_physical->next;
666 } // while (sp_per_regex != RMM_OFF_NULL)
667 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, r->server, "Could not find a similar page for the requesed URL");
668 return DECLINED;
672 * Find a page in the cache for an URL that is similar to the requested URL and that can
673 * fullfill at least one of the expected mime-types indicated in the "Accept" header
674 * This page can then be used by the CRCCache as basis for the CRCSYNC/Delta-http encoding.
676 apr_status_t find_similar_page(disk_cache_object_t *dobj, request_rec *r, similar_page_cache_t *sp_cache)
678 if (!sp_cache->similar_pages_cache_initialized)
680 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, r->server, "Similar page cache is not initialized");
681 return DECLINED;
683 const char *host = apr_table_get(r->headers_in, HOST_HEADER);
684 if (!host) {
685 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, r->server, "Can't find host header in the request");
686 return DECLINED;
689 apr_status_t findrslt;
690 if (is_lock_available(sp_cache)) {
691 apr_status_t lockrslt = apr_global_mutex_lock(sp_cache->fs_cache_lock);
692 if (lockrslt != APR_SUCCESS)
694 ap_log_error(APLOG_MARK, APLOG_WARNING, lockrslt, r->server, "Can't obtain the lock");
695 return lockrslt;
697 findrslt = find_similar_page_cs(dobj, r, sp_cache, host);
698 lockrslt = apr_global_mutex_unlock(sp_cache->fs_cache_lock);
699 if (lockrslt != APR_SUCCESS)
701 ap_log_error(APLOG_MARK, APLOG_WARNING, lockrslt, r->server, "Can't release the lock");
704 else {
705 findrslt = find_similar_page_cs(dobj, r, sp_cache, host);
707 return findrslt;
711 * Create info about a cached file
712 * Returns RMM_OFF_NULL when a memory allocation error has occured.
714 static RMM_OFF_T(cached_files_info_t) create_cached_files_info(apr_rmm_t *rmm,
715 const char *basepath, const char *uri, RMM_OFF_T(vary_headers_t) vary_headers)
717 RMM_OFF_T(cached_files_info_t) cached_files_info = apr_rmm_calloc(rmm, sizeof(cached_files_info_t));
718 if (cached_files_info == RMM_OFF_NULL)
720 return cached_files_info;
722 cached_files_info_t *cfi_physical = APR_RMM_ADDR_GET(cached_files_info_t, rmm, cached_files_info);
723 cfi_physical->basepath = rmm_strdup(rmm, basepath);
724 cfi_physical->uri = rmm_strdup(rmm, uri);
725 if (cfi_physical->basepath == RMM_OFF_NULL || cfi_physical->uri == RMM_OFF_NULL)
727 clear_rmm_field(rmm, &cfi_physical->basepath);
728 clear_rmm_field(rmm, &cfi_physical->uri);
729 apr_rmm_free(rmm, cached_files_info);
730 return RMM_OFF_NULL;
732 cfi_physical->prev = RMM_OFF_NULL;
733 cfi_physical->next = RMM_OFF_NULL;
734 cfi_physical->vary_headers = vary_headers;
736 return cached_files_info;
740 * Create a 'similar pages per content type' structure for the current basepath, uri and content_type
741 * Returns NULL when a memory allocation error has occured
743 static RMM_OFF_T(sp_per_content_type_t) create_sp_per_content_type(apr_rmm_t *rmm,
744 const char *basepath, const char *uri, const char *content_type, RMM_OFF_T(vary_headers_t)vary_headers)
746 RMM_OFF_T(sp_per_content_type_t) sp_per_ct = apr_rmm_calloc(rmm, sizeof(sp_per_content_type_t));
747 if (sp_per_ct == RMM_OFF_NULL)
749 return RMM_OFF_NULL; // Memory allocation failure!
751 sp_per_content_type_t *sp_per_ct_physical = APR_RMM_ADDR_GET(sp_per_content_type_t, rmm, sp_per_ct);
752 sp_per_ct_physical->next = RMM_OFF_NULL;
753 sp_per_ct_physical->content_type = rmm_strdup(rmm, content_type);
754 if (sp_per_ct_physical->content_type == RMM_OFF_NULL)
756 apr_rmm_free(rmm, sp_per_ct);
757 return RMM_OFF_NULL;
760 sp_per_ct_physical->cached_files_info = create_cached_files_info(rmm, basepath, uri, vary_headers);
761 if (sp_per_ct_physical->cached_files_info == RMM_OFF_NULL)
763 apr_rmm_free(rmm, sp_per_ct_physical->content_type);
764 apr_rmm_free(rmm, sp_per_ct);
765 return RMM_OFF_NULL;
767 sp_per_ct_physical->tail_file_info = sp_per_ct_physical->cached_files_info;
769 sp_per_ct_physical->cached_files_info_by_path = rmm_hash_make(rmm);
770 if (sp_per_ct_physical->cached_files_info_by_path == RMM_OFF_NULL)
772 free_cached_files_info(rmm, sp_per_ct_physical, sp_per_ct_physical->cached_files_info);
773 apr_rmm_free(rmm, sp_per_ct_physical->content_type);
774 apr_rmm_free(rmm, sp_per_ct);
775 return RMM_OFF_NULL;
777 // FIXME: rmm_hash_set should be able to return an out-of-memory condition when appropriate so that *this* function can properly handle
778 // the error condition...
779 rmm_hash_set(rmm, sp_per_ct_physical->cached_files_info_by_path,
780 APR_RMM_ADDR_GET(cached_files_info_t, rmm, sp_per_ct_physical->cached_files_info)->basepath, APR_HASH_KEY_STRING,
781 sp_per_ct_physical->cached_files_info);
783 return sp_per_ct;
788 * Create a 'similar pages per regex' structure for the current regex, basepath, uri and content_type
789 * Returns NULL when a memory allocation error has occured
791 static RMM_OFF_T(sp_per_regex_t) create_sp_per_regex(apr_rmm_t *rmm,
792 const char *regex, const char *basepath, const char *uri, const char *content_type, RMM_OFF_T(vary_headers_t)vary_headers)
794 RMM_OFF_T(sp_per_regex_t) sp_per_regex = apr_rmm_calloc(rmm, sizeof(sp_per_regex_t));
795 if (sp_per_regex == RMM_OFF_NULL)
797 return RMM_OFF_NULL; // Memory allocation failure!
799 sp_per_regex_t *sp_per_regex_physical = APR_RMM_ADDR_GET(sp_per_regex_t, rmm, sp_per_regex);
800 sp_per_regex_physical->next = RMM_OFF_NULL;
801 sp_per_regex_physical->regex_len = strlen(regex);
802 sp_per_regex_physical->regex = rmm_strdup(rmm, regex);
803 if (sp_per_regex_physical->regex == RMM_OFF_NULL)
805 apr_rmm_free(rmm, sp_per_regex);
806 return RMM_OFF_NULL;
808 sp_per_regex_physical->similar_pages_per_content_type = create_sp_per_content_type(rmm, basepath, uri, content_type, vary_headers);
809 if (sp_per_regex_physical->similar_pages_per_content_type == RMM_OFF_NULL)
811 apr_rmm_free(rmm, sp_per_regex_physical->regex);
812 apr_rmm_free(rmm, sp_per_regex);
813 return RMM_OFF_NULL;
815 return sp_per_regex;
819 * Add a new cached file to the list of cached files for the current content type or update the entry if it
820 * is already present
821 * Returns: 1 on memory allocation error
822 * 0 when all fine
824 static int add_cached_file_to_content_type(similar_page_cache_t *sp_cache, sp_per_content_type_t *sp_per_ct_physical,
825 const char *basepath, const char *uri, RMM_OFF_T(vary_headers_t) vary_headers)
827 apr_rmm_t *rmm = sp_cache->rmm;
828 RMM_OFF_T(cached_files_info_t) cached_file_info;
829 cached_files_info_t *cfi_physical;
831 // Make the cached_file_info record
832 cached_file_info = create_cached_files_info(rmm, basepath, uri, vary_headers);
833 if (cached_file_info == RMM_OFF_NULL) {
834 return 1; // Could not allocate memory. Can't store the info.
836 cfi_physical = APR_RMM_ADDR_GET(cached_files_info_t, rmm, cached_file_info);
838 // Insert the new entry at the head of the list
839 cfi_physical->next = sp_per_ct_physical->cached_files_info;
840 if (cfi_physical->next != RMM_OFF_NULL) {
841 // There was already something in the list. Make the old head entry point back to
842 // this new head entry
843 APR_RMM_ADDR_GET(cached_files_info_t, rmm, cfi_physical->next)->prev = cached_file_info;
845 else {
846 // The list was empty. This new entry is now by definition a tail entry
847 sp_per_ct_physical->tail_file_info = cached_file_info;
849 sp_per_ct_physical->cached_files_info = cached_file_info;
851 // Remove old version of the page (if it exists) from the list
852 RMM_OFF_T(cached_files_info_t) old_cached_file = rmm_hash_get(rmm,
853 sp_per_ct_physical->cached_files_info_by_path,
854 basepath, APR_HASH_KEY_STRING);
855 if (old_cached_file != RMM_OFF_NULL) {
856 free_cached_files_info(rmm, sp_per_ct_physical, old_cached_file);
859 // Add the new version to the reverse index
860 // FIXME: deal with failure of rmm_hash_set (once rmm_hash_set has been fixed to return an out-of-memory condition
861 // when appropriate
862 rmm_hash_set(rmm, sp_per_ct_physical->cached_files_info_by_path, cfi_physical->basepath, APR_HASH_KEY_STRING, cached_file_info);
864 if (rmm_hash_count(rmm, sp_per_ct_physical->cached_files_info_by_path) > 40 /* TODO: make this threshold configurable */)
866 // Only maintain info about the (40) most recently cached pages per host per regex per content-type
867 // The chance that all of them point to meanwhile deleted/obsolete files is very small, considering
868 // the fact that each freshly cached file gets inserted at the head of the list, so it does not make
869 // much sense to fill-up the memory with a longer list.
870 free_cached_files_info(rmm, sp_per_ct_physical, sp_per_ct_physical->tail_file_info);
873 return 0; // Cached file info successfully added
877 * Add a new cached file to the list of cached files for the current regular expression or update the page if it
878 * is already present
879 * Returns: 1 on memory allocation error
880 * 0 when all fine
882 static int add_cached_file_to_regex(similar_page_cache_t *sp_cache, sp_per_regex_t *sp_per_regex_physical,
883 const char *basepath, const char *uri, const char *content_type, RMM_OFF_T(vary_headers_t)vary_headers)
885 RMM_OFF_T(sp_per_content_type_t) sp_per_ct;
886 apr_rmm_t *rmm = sp_cache->rmm;
887 sp_per_ct = sp_per_regex_physical->similar_pages_per_content_type;
888 while (sp_per_ct != RMM_OFF_NULL) {
889 sp_per_content_type_t *sp_per_ct_physical = APR_RMM_ADDR_GET(sp_per_content_type_t, rmm, sp_per_ct);
890 if (!strcmp(content_type, APR_RMM_ADDR_GET(char, rmm, sp_per_ct_physical->content_type))) {
891 // Found the correct entry. Add or update the page here
892 return add_cached_file_to_content_type(sp_cache, sp_per_ct_physical, basepath, uri, vary_headers);
894 sp_per_ct = sp_per_ct_physical->next;
896 // There is nothing yet for this content type. Add it to the list
897 sp_per_ct = create_sp_per_content_type(rmm, basepath, uri, content_type, vary_headers);
898 if (sp_per_ct == RMM_OFF_NULL) {
899 return 1;
901 // Add it to the head of the list
902 APR_RMM_ADDR_GET(sp_per_content_type_t, rmm, sp_per_ct)->next = sp_per_regex_physical->similar_pages_per_content_type;
903 sp_per_regex_physical->similar_pages_per_content_type = sp_per_ct;
904 return 0;
909 * Add a new page to the list of similar pages for current host or update an existing page
910 * Returns: 1 on memory allocation error
911 * 0 when all fine
913 static int add_similar_pages_info(similar_page_cache_t *sp_cache, RMM_OFF_T(sp_per_regex_t) *sp_per_regex_p,
914 const char *regex, const char *basepath, const char *uri, const char *content_type, RMM_OFF_T(vary_headers_t)vary_headers)
916 apr_rmm_t *rmm = sp_cache->rmm;
917 size_t regex_len = strlen(regex);
918 while (1)
920 RMM_OFF_T(sp_per_regex_t) curr_sp_per_regex = *sp_per_regex_p;
921 sp_per_regex_t *sp_per_regex_physical = APR_RMM_ADDR_GET(sp_per_regex_t, rmm, curr_sp_per_regex);
922 if (regex_len == sp_per_regex_physical->regex_len && strcmp(regex, APR_RMM_ADDR_GET(char, rmm, sp_per_regex_physical->regex))==0)
924 // Found a perfect match. Add or update the page to the head of the current pages list
925 return add_cached_file_to_regex(sp_cache, sp_per_regex_physical, basepath, uri, content_type, vary_headers);
927 else
929 if (regex_len > sp_per_regex_physical->regex_len )
931 // No matching regex found that is longer then the current regex.
932 // Insert the new entry here in the list, so that the list remains sorted in descending order on regex_len
933 RMM_OFF_T(sp_per_regex_t) new_sp_per_regex = create_sp_per_regex(rmm, regex, basepath, uri, content_type, vary_headers);
934 if (new_sp_per_regex == RMM_OFF_NULL)
936 return 1; // Out of memory condition occurred
938 APR_RMM_ADDR_GET(sp_per_regex_t, rmm, new_sp_per_regex)->next = curr_sp_per_regex;
939 *sp_per_regex_p = new_sp_per_regex;
940 return 0; // New page succesfully inserted
942 else
944 if (sp_per_regex_physical->next == RMM_OFF_NULL)
946 // Reached tail of the list. The new regex is shorter then any of the existing ones
947 // Insert new entry to the end of the list
948 RMM_OFF_T(sp_per_regex_t) new_sp_per_regex = create_sp_per_regex(rmm, regex, basepath, uri, content_type, vary_headers);
949 if (new_sp_per_regex == RMM_OFF_NULL)
951 return 1; // Out of memory condition occurred
953 sp_per_regex_physical->next = new_sp_per_regex;
954 return 0; // New page succesfully inserted
956 // Evaluate the next entry
957 sp_per_regex_p = &sp_per_regex_physical->next;
961 return 0;
965 * Add (or update) a cached page to the 'similar pages' cache
966 * Returns: 1 on memory allocation error
967 * 0 when all fine
968 * The invoking function may want to log a warning in case of memory
969 * allocation error so that the system administrator can tune the cache
970 * parameters if this happens too often
972 static int add_cached_page(similar_page_cache_t *sp_cache, const char *regex, const char *host,
973 const char *basepath, const char *uri, const char *content_type, RMM_OFF_T(vary_headers_t)vary_headers)
975 apr_rmm_t *rmm = sp_cache->rmm;
976 RMM_OFF_T(sp_per_regex_t) sp_per_regex = rmm_hash_get(rmm, sp_cache->similar_pages_per_host, host, APR_HASH_KEY_STRING);
977 if (sp_per_regex == RMM_OFF_NULL)
979 // There is no info yet for the current host. Make the first entry.
980 RMM_OFF_T(char) host_offset = rmm_strdup(rmm, host);
981 if (host_offset == RMM_OFF_NULL) {
982 return 1; // Could not allocate memory
984 sp_per_regex = create_sp_per_regex(rmm, regex, basepath, uri, content_type, vary_headers);
985 if (sp_per_regex == RMM_OFF_NULL) {
986 apr_rmm_free(rmm, host_offset);
987 return 1; // Could not allocate memory!
989 rmm_hash_set(rmm, sp_cache->similar_pages_per_host, host_offset, APR_HASH_KEY_STRING, sp_per_regex);
990 return 0; // All fine
992 else
994 // The current entry already contains similar pages info. Add new or updated page to the list
995 int rslt = add_similar_pages_info(sp_cache, &sp_per_regex, regex, basepath, uri, content_type, vary_headers);
996 return rslt;
1001 * Allocate and initialze an empty similar page cache
1003 static apr_status_t similar_page_cache_init(apr_pool_t *pool, server_rec *s, similar_page_cache_t *sp_cache)
1005 #if APR_HAS_SHARED_MEMORY
1006 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, s, "APR supports shared memory");
1007 apr_status_t result;
1008 apr_size_t requested_size;
1009 apr_size_t retrieved_size;
1011 if (sp_cache->cache_file) {
1012 /* Remove any existing shm segment with this name. */
1013 apr_shm_remove(sp_cache->cache_file, pool);
1016 requested_size = APR_ALIGN_DEFAULT(sp_cache->cache_bytes);
1017 result = apr_shm_create(&sp_cache->shm, requested_size, sp_cache->cache_file, pool);
1018 if (result != APR_SUCCESS) {
1019 ap_log_error(APLOG_MARK, APLOG_ERR, result, s,
1020 "Unable to obtain %" APR_SIZE_T_FMT " bytes shared memory", requested_size);
1021 return result;
1024 /* Determine the usable size of the shm segment. */
1025 retrieved_size = apr_shm_size_get(sp_cache->shm);
1026 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, s,
1027 "Requested %" APR_SIZE_T_FMT " bytes shared memory, retrieved %" APR_SIZE_T_FMT " bytes",
1028 requested_size, retrieved_size);
1030 /* This will create a rmm "handler" to get into the shared memory area */
1031 result = apr_rmm_init(&sp_cache->rmm, NULL,
1032 apr_shm_baseaddr_get(sp_cache->shm), retrieved_size,
1033 pool);
1034 if (result != APR_SUCCESS) {
1035 ap_log_error(APLOG_MARK, APLOG_ERR, result, s, "Unable to initialize rmm handler for (shared) memory");
1036 return result;
1038 #else
1039 void *local_memory = apr_palloc(pool, sp_cache->cache_bytes);
1040 if (local_memory == NULL)
1042 ap_log_error(APLOG_MARK, APLOG_ERR, result, s,
1043 "Unable to obtain %" APR_SIZE_T_FMT " bytes of memory", requested_size);
1046 /* This will create a rmm "handler" to get into the memory area */
1047 result = apr_rmm_init(&sp_cache->rmm, NULL,
1048 local_memory, sp_cache->cache_bytes,
1049 pool);
1050 if (result != APR_SUCCESS) {
1051 ap_log_error(APLOG_MARK, APLOG_ERR, result, s, "Unable to initialize rmm handler for (shared) memory");
1052 return result;
1055 #endif
1057 apr_pool_cleanup_register(pool, sp_cache, similar_page_cache_kill, apr_pool_cleanup_null);
1059 sp_cache->similar_pages_per_host = rmm_hash_make(sp_cache->rmm);
1060 if (sp_cache->similar_pages_per_host == RMM_OFF_NULL) {
1061 ap_log_error(APLOG_MARK, APLOG_ERR, APR_EGENERAL, s, "Unable to allocate memory for similar pages info cache");
1062 return APR_EGENERAL;
1065 sp_cache->lock_is_available = apr_rmm_calloc(sp_cache->rmm, sizeof(int));
1066 if (sp_cache->lock_is_available == RMM_OFF_NULL) {
1067 ap_log_error(APLOG_MARK, APLOG_ERR, APR_EGENERAL, s, "Unable to allocate memory for similar pages info cache");
1068 return APR_EGENERAL;
1071 sp_cache->vary_headers_cache = rmm_hash_make(sp_cache->rmm);
1072 if (sp_cache->vary_headers_cache == RMM_OFF_NULL) {
1073 ap_log_error(APLOG_MARK, APLOG_ERR, APR_EGENERAL, s, "Unable to allocate memory for similar pages info cache");
1074 return APR_EGENERAL;
1077 return APR_SUCCESS;
1080 static apr_status_t make_vary_headers(apr_pool_t *p, server_rec *s, similar_page_cache_t *sp_cache,
1081 apr_table_t *req_hdrs, apr_table_t *resp_hdrs, RMM_OFF_T(vary_headers_t) *vary_headers_p)
1083 *vary_headers_p = RMM_OFF_NULL;
1084 apr_rmm_t *rmm = sp_cache->rmm;
1085 const char *vary = apr_table_get(resp_hdrs, VARY_HEADER);
1086 if (vary != NULL)
1088 char *headername;
1089 char *vary_cache_key = "";
1090 char *separator="";
1091 while ((headername = ap_get_token(p, &vary, 1)) != NULL && strlen(headername) != 0)
1093 // Ignore 'Accept-Encoding' vary header; we transform anything anyway to identity coding before storing it in the cache
1094 // so it does not matter what the server has done with respect to the content-encoding.
1095 if (strcmp(headername, ACCEPT_ENCODING_HEADER) != 0) {
1096 vary_cache_key = apr_pstrcat(p, vary_cache_key, separator, headername, "=", apr_table_get(req_hdrs, headername), NULL);
1097 separator=", ";
1100 if (*vary_cache_key == 0) {
1101 // Apparently the content only varies based on the 'Accept-Encoding', which we ignore.
1102 return APR_SUCCESS;
1104 RMM_OFF_T(vary_headers_t) vary_headers = rmm_hash_get(sp_cache->rmm, sp_cache->vary_headers_cache, vary_cache_key, APR_HASH_KEY_STRING);
1105 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, s, "Vary cache key: %s, found in cache?: %s",
1106 vary_cache_key, (vary_headers == RMM_OFF_NULL) ? "No" : "Yes");
1107 if (vary_headers == RMM_OFF_NULL) {
1108 // This vary headers combination is not yet cached. Make the structure and cache it
1109 vary = apr_table_get(resp_hdrs, VARY_HEADER); // Get again the vary header
1110 while ((headername = ap_get_token(p, &vary, 1)) != NULL && strlen(headername) != 0)
1112 // Ignore 'Accept-Encoding' vary header; we transform anything anyway to identity coding before storing it in the cache
1113 // so it does not matter what the server has done with respect to the content-encoding.
1114 if (strcmp(headername, ACCEPT_ENCODING_HEADER) != 0) {
1115 // Allocate the new entry
1116 RMM_OFF_T(vary_headers_t) new_vary_header = apr_rmm_malloc(rmm, sizeof(vary_headers_t));
1117 if (new_vary_header == RMM_OFF_NULL) {
1118 return 1; // Could not allocate memory
1120 vary_headers_t *new_vh_physical = APR_RMM_ADDR_GET(vary_headers_t, rmm, new_vary_header);
1122 // Put the new vary header at the head of the list of entries
1123 new_vh_physical->next = vary_headers;
1124 vary_headers = new_vary_header;
1126 if ((new_vh_physical->name = rmm_strdup(rmm, headername)) == RMM_OFF_NULL) {
1127 return 1;
1130 new_vh_physical->value = RMM_OFF_NULL;
1131 const char *value = apr_table_get(req_hdrs, headername);
1132 if (value != NULL)
1134 if ((new_vh_physical->value = rmm_strdup(rmm, value)) == RMM_OFF_NULL) {
1135 return 1;
1140 rmm_hash_set(sp_cache->rmm, sp_cache->vary_headers_cache, rmm_strdup(rmm, vary_cache_key), APR_HASH_KEY_STRING, vary_headers);
1143 *vary_headers_p = vary_headers;
1145 return APR_SUCCESS;
1149 * Load the info from the file-cache into the 'find similar page' cache
1151 static apr_status_t similar_page_cache_load(apr_pool_t *ptemp, server_rec *s, const char *abs_dirname, const char *rel_dirname, similar_page_cache_t *sp_cache)
1153 apr_status_t result;
1154 apr_dir_t *dirinfo; // structure for referencing directories
1155 apr_finfo_t fileinfo; // file information structure
1157 // ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, s, "Opening directory %s", abs_dirname);
1158 result = apr_dir_open(&dirinfo, abs_dirname, ptemp);
1159 // ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, s, "Result: %d", result);
1160 if (result != APR_SUCCESS)
1162 ap_log_error(APLOG_MARK, APLOG_WARNING, result, s, "Unable to open directory %s", abs_dirname);
1163 return result;
1165 while (apr_dir_read(&fileinfo, 0, dirinfo) == APR_SUCCESS)
1167 if (!strcmp(fileinfo.name, ".") || !strcmp(fileinfo.name, ".."))
1169 // Do not recursively go into current or parent directory!
1170 continue;
1172 if (fileinfo.filetype == APR_DIR)
1174 const char *sub_abs_dirname = apr_pstrcat(ptemp, abs_dirname, "/", fileinfo.name, NULL);
1175 const char *sub_rel_dirname = (*rel_dirname == 0) ? apr_pstrdup(ptemp, fileinfo.name) :
1176 apr_pstrcat(ptemp, rel_dirname, "/", fileinfo.name, NULL);
1177 if (similar_page_cache_load(ptemp, s, sub_abs_dirname, sub_rel_dirname, sp_cache) != APR_SUCCESS)
1179 continue; // skip this sub directory and process the next one
1182 else if (fileinfo.filetype == APR_REG)
1184 // ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, s, "About to open file: %s", fileinfo.name);
1185 if (strstr(fileinfo.name, CACHE_HEADER_SUFFIX) != NULL)
1187 // ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, s, "Its a header file");
1188 // Build the key (basepath) for the cache.
1189 // It consists of the relative path name exluding the .header extension
1190 char *basepath = apr_pstrdup(ptemp, fileinfo.name);
1191 *strstr(basepath, CACHE_HEADER_SUFFIX)=0;
1192 basepath = apr_pstrcat(ptemp, rel_dirname, "/", basepath, NULL);
1193 // ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, s, "Basepath: %s", basepath);
1195 char *full_filepath = apr_pstrcat(ptemp, abs_dirname, "/", fileinfo.name, NULL);
1196 // ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, s, "Full_filepath: %s", full_filepath);
1198 apr_file_t *fd;
1199 result = apr_file_open(&fd, full_filepath, APR_READ|APR_BINARY|APR_BUFFERED, 0, ptemp);
1200 if (result != APR_SUCCESS)
1202 ap_log_error(APLOG_MARK, APLOG_WARNING, result, s, "Failed to open file %s", full_filepath);
1203 continue; // Skip this file
1206 apr_uint32_t format;
1207 apr_size_t len;
1209 /* Read and evaluate the format from the cache file */
1210 len = sizeof(format);
1211 apr_file_read_full(fd, &format, len, &len);
1212 if (format == VARY_FORMAT_VERSION) {
1213 // TODO: Smartly handle "vary" header files. But skip them for the time being.
1214 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, s, "Skipping vary header file %s", full_filepath);
1215 apr_file_close(fd);
1216 continue; // Skip this file
1218 if (format != DISK_FORMAT_VERSION) {
1219 ap_log_error(APLOG_MARK, APLOG_WARNING, APR_SUCCESS, s,
1220 "File %s has a version mismatch. File had version %d, but expected version is %d",
1221 full_filepath, format, DISK_FORMAT_VERSION);
1222 apr_file_close(fd);
1223 continue; // Skip this file
1225 // Format OK, rewind to file begin
1226 apr_off_t offset = 0;
1227 apr_file_seek(fd, APR_SET, &offset);
1229 // Read metadata from file
1230 cache_object_t *obj = apr_pcalloc(ptemp, sizeof(cache_object_t));;
1231 disk_cache_object_t *dobj = apr_pcalloc(ptemp, sizeof(disk_cache_object_t));;
1232 cache_info_t *cache_info = &(obj->info);
1233 result = file_cache_recall_mydata(ptemp, fd, cache_info, dobj, 0);
1234 if (result != APR_SUCCESS)
1236 ap_log_error(APLOG_MARK, APLOG_WARNING, result, s,
1237 "Problem encountered reading meta data from %s", full_filepath);
1238 apr_file_close(fd);
1239 continue; // Skip this file
1242 // Read request and response headers
1243 apr_table_t *req_hdrs = apr_table_make(ptemp, 20);
1244 apr_table_t *resp_hdrs = apr_table_make(ptemp, 20);
1245 result = read_table(s, resp_hdrs, fd);
1246 if (result != APR_SUCCESS)
1248 ap_log_error(APLOG_MARK, APLOG_WARNING, result, s, "Failed to read response headers from file %s", full_filepath);
1249 apr_file_close(fd);
1250 continue; // Skip this file
1252 result = read_table(s, req_hdrs, fd);
1253 apr_file_close(fd);
1254 if (result != APR_SUCCESS)
1256 ap_log_error(APLOG_MARK, APLOG_WARNING, result, s, "Failed to read request headers from file %s", full_filepath);
1257 continue; // Skip this file
1260 // Add file to 'similar pages' cache if host, crcsync_similar and content_type headers are present
1261 const char *hostname = apr_table_get(req_hdrs, HOST_HEADER);
1262 const char *crcsync_similar = apr_table_get(resp_hdrs, CRCSYNC_SIMILAR_HEADER);
1263 const char *content_type = apr_table_get(resp_hdrs, CONTENT_TYPE_HEADER);
1264 if (hostname != NULL && crcsync_similar != NULL && content_type != NULL)
1266 RMM_OFF_T(vary_headers_t) vary_headers;
1267 result = make_vary_headers(ptemp, s, sp_cache, req_hdrs, resp_hdrs, &vary_headers);
1268 if (result != 0) {
1269 ap_log_error(APLOG_MARK, APLOG_WARNING, APR_SUCCESS, s,
1270 "Could not allocate memory to cache vary headers");
1271 continue; // Skip this file
1273 result = add_cached_page(sp_cache, crcsync_similar, hostname, basepath, cache_info->uri, content_type, vary_headers);
1274 if (result == 0)
1276 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, s,
1277 "Successfully added file %s to 'find similar page' cache (host: %s, content-type: %s, regex: %s, uri: %s)",
1278 basepath, hostname, content_type, crcsync_similar, cache_info->uri);
1280 else
1282 ap_log_error(APLOG_MARK, APLOG_WARNING, APR_SUCCESS, s,
1283 "Failed to add file %s with regex %s for host %s, content-type %s, uri %s to 'find similar page' cache, result: %d",
1284 basepath, crcsync_similar, hostname, content_type, cache_info->uri, result);
1289 else
1291 ap_log_error(APLOG_MARK, APLOG_WARNING, APR_SUCCESS, s, "Unknown file type %d for file %s/%s",
1292 fileinfo.filetype, abs_dirname, fileinfo.name);
1296 apr_dir_close(dirinfo);
1297 return APR_SUCCESS;
1300 const char *crccache_client_fsp_set_cache_bytes(cmd_parms *parms, void *in_struct_ptr,
1301 const char *arg, similar_page_cache_t *sp_cache)
1303 apr_size_t val = atol(arg);
1304 if (val < 0)
1305 return "CRCClientSharedCacheSize value must be an integer greater than or equal to 0";
1306 sp_cache->cache_bytes = val;
1307 return NULL;
1311 similar_page_cache_t *create_similar_page_cache(apr_pool_t *p)
1313 similar_page_cache_t *sp_cache = apr_pcalloc(p, sizeof(similar_page_cache_t));
1314 if (sp_cache != NULL) {
1315 sp_cache->cache_bytes = 10*1024*1024; // Default to 10 MB
1317 return sp_cache;
1320 static void create_global_mutex(similar_page_cache_t *sp_cache, apr_pool_t *p, apr_pool_t *ptemp, server_rec *s)
1322 apr_status_t result;
1323 result = apr_global_mutex_create(&sp_cache->fs_cache_lock,
1324 sp_cache->lock_file, APR_LOCK_DEFAULT,
1326 if (result != APR_SUCCESS) {
1327 ap_log_error(APLOG_MARK, APLOG_WARNING, result, s,
1328 "Failed to allocate mutex on vhost %s. Similar page cache will only be loaded on start-up but not maintained for new pages cached while the server is running",
1329 format_hostinfo(ptemp, s));
1330 sp_cache->fs_cache_lock = NULL;
1331 return;
1334 #ifdef AP_NEED_SET_MUTEX_PERMS
1335 result = unixd_set_global_mutex_perms(sp_cache->fs_cache_lock);
1336 if (result != APR_SUCCESS) {
1337 ap_log_error(APLOG_MARK, APLOG_WARNING, result, s,
1338 "Failed to set mutex permissions on vhost %s. Similar page cache will only be loaded on start-up but not maintained for new pages cached while the server is running",
1339 format_hostinfo(ptemp, s));
1340 apr_global_mutex_destroy(sp_cache->fs_cache_lock);
1341 sp_cache->fs_cache_lock = NULL;
1342 return;
1344 #endif
1346 // Lock is available for all threads/subprocesses
1347 *APR_RMM_ADDR_GET(int, sp_cache->rmm, sp_cache->lock_is_available)=1;
1350 int crccache_client_fsp_post_config_per_virtual_host(apr_pool_t *p, apr_pool_t *plog,
1351 apr_pool_t *ptemp, server_rec *s, similar_page_cache_t *sp_cache, const char *cache_root)
1353 apr_status_t result;
1356 * Set-up the shared memory block and the mutex for the 'find similar page' memory cache
1359 // Need to know the CacheRootClient value in order to make the SHM
1360 // cache backing file and the mutex lock backing file
1362 const char *cache_file_tmp = apr_pstrcat(ptemp, cache_root, "/crccache_client_shm", NULL);
1363 const char *lock_file_tmp = apr_pstrcat(ptemp, cache_file_tmp, ".lck", NULL);
1364 void *data;
1365 const char *userdata_key = apr_pstrcat(p, "crccache_client_init:", cache_root, NULL);
1367 /* util_crccache_client_post_config() will be called twice. Don't bother
1368 * going through all of the initialization on the first call
1369 * because it will just be thrown away.*/
1370 apr_pool_userdata_get(&data, userdata_key, s->process->pool);
1371 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, s,"vhost %s, data=%s",
1372 format_hostinfo(ptemp, s),
1373 data == NULL ? "null" : "not null");
1374 if (!data) {
1375 // This code-block is only executed on first invocation of post_config
1376 apr_pool_userdata_set((const void *)1, userdata_key,
1377 apr_pool_cleanup_null, s->process->pool);
1378 #if APR_HAS_SHARED_MEMORY
1379 /* If the lock file already exists then delete it. Otherwise we are
1380 * going to run into problems creating the shared memory mutex. */
1381 if (lock_file_tmp) {
1382 apr_file_remove(lock_file_tmp, ptemp);
1384 #endif
1385 return OK;
1389 // Below code-block is only executed on second invocation of post_config
1390 sp_cache->cache_root = cache_root;
1391 sp_cache->cache_root_len = strlen(cache_root);
1392 sp_cache->cache_file = apr_pstrdup(p, cache_file_tmp);
1393 sp_cache->lock_file = apr_pstrdup(p, lock_file_tmp);
1395 #if APR_HAS_SHARED_MEMORY
1396 /* initializing cache if we don't have shm address
1398 if (!sp_cache->shm) {
1399 #endif
1400 /* initializing cache if shared memory size or entries is not zero
1402 if (sp_cache->cache_bytes > 0) {
1403 result = similar_page_cache_init(p, s, sp_cache);
1404 if (result != APR_SUCCESS) {
1405 ap_log_error(APLOG_MARK, APLOG_ERR, result, s,
1406 "Could not initialize in-memory cache to efficiently find similar pages on vhost %s. Find similar page functionality is disabled",
1407 format_hostinfo(ptemp, s));
1408 return DONE;
1411 create_global_mutex(sp_cache, p, ptemp, s);
1413 result = similar_page_cache_load(ptemp, s, sp_cache->cache_root, "", sp_cache);
1414 if (result != APR_SUCCESS) {
1415 ap_log_error(APLOG_MARK, APLOG_ERR, result, s,
1416 "Failed to load data into in-memory cache to efficiently find similar pages on vhost %s. Find similar page functionality is disabled",
1417 format_hostinfo(ptemp, s));
1418 return result;
1421 sp_cache->similar_pages_regexs = apr_hash_make(p); // Set-up cache for compiled regular expressions for similar page lookup
1422 sp_cache->similar_pages_cache_initialized = 1; // Similar page cache has finally been successfully set-up and is ready to be used
1424 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, s,
1425 "Successfully initialized shared memory cache for this context (%s)",
1426 format_hostinfo(ptemp, s));
1428 else {
1429 ap_log_error(APLOG_MARK, APLOG_INFO, APR_SUCCESS, s,
1430 "CRCCacheClientSharedCacheSize is zero on vhost %s. Find similar page functionality is disabled",
1431 format_hostinfo(ptemp, s));
1433 #if APR_HAS_SHARED_MEMORY
1435 else
1437 ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, s,
1438 "vhost (%s): Weird. Shared memory cache is already initialized for this context",
1439 format_hostinfo(ptemp, s));
1441 #endif
1442 return OK;
1445 void crccache_client_fsp_child_init_per_virtual_host(apr_pool_t *p, server_rec *s, similar_page_cache_t *sp_cache)
1447 apr_status_t sts;
1449 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, s,
1450 "mod_crccache_client.child_init_per_vhost (%s): cache_lock: %s",
1451 format_hostinfo(p, s),
1452 sp_cache->fs_cache_lock ? "defined" : "empty");
1454 if (sp_cache->fs_cache_lock)
1456 sts = apr_global_mutex_child_init(&sp_cache->fs_cache_lock,
1457 sp_cache->lock_file, p);
1458 if (sts != APR_SUCCESS) {
1459 ap_log_error(APLOG_MARK, APLOG_WARNING, sts, s,
1460 "Failed to initialise global mutex %s in child process %" APR_PID_T_FMT ". The similar page cache will not be maintained for newly cached pages",
1461 sp_cache->lock_file, getpid());
1462 sp_cache->fs_cache_lock = NULL; // Disable the global mutex in this child process
1463 *APR_RMM_ADDR_GET(int, sp_cache->rmm, sp_cache->lock_is_available) = 0; // Disable global mutex in all child processes
1465 else
1467 ap_log_error(APLOG_MARK, APLOG_DEBUG, sts, s,
1468 "Successfully initialized global mutex %s in child process %" APR_PID_T_FMT ".",
1469 sp_cache->lock_file, getpid());
1474 void update_or_add_similar_page(disk_cache_object_t *dobj, request_rec *r, similar_page_cache_t *sp_cache)
1476 if (!is_lock_available(sp_cache)) {
1477 return; // Lock is not available. Can't start doing updates
1480 if (strlen(dobj->hdrsfile)+1 < sp_cache->cache_root_len ||
1481 memcmp(dobj->hdrsfile, sp_cache->cache_root, sp_cache->cache_root_len) ||
1482 dobj->hdrsfile[sp_cache->cache_root_len] != '/') {
1483 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_EGENERAL, r->server,
1484 "FIXME: Header file name %s does not start with cache root path %s while it should",
1485 dobj->hdrsfile, sp_cache->cache_root);
1486 return;
1488 char *basepath = apr_pstrdup(r->pool, dobj->hdrsfile+sp_cache->cache_root_len+1);
1489 apr_size_t suffix_len=strlen(CACHE_HEADER_SUFFIX);
1490 apr_size_t basepath_len = strlen(basepath);
1491 if (basepath_len < suffix_len || memcmp(basepath+(basepath_len-suffix_len), CACHE_HEADER_SUFFIX, suffix_len)) {
1492 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_EGENERAL, r->server,
1493 "FIXME: Header file name %s does not end on %s suffix",
1494 dobj->hdrsfile, CACHE_HEADER_SUFFIX);
1495 return;
1498 *(basepath+(basepath_len-suffix_len)) = 0; // Terminate the suffix location
1500 const char *hostname = apr_table_get(r->headers_in, HOST_HEADER);
1501 const char *crcsync_similar = apr_table_get(r->headers_out, CRCSYNC_SIMILAR_HEADER);
1502 const char *content_type = apr_table_get(r->headers_out, CONTENT_TYPE_HEADER);
1503 if (hostname != NULL && crcsync_similar != NULL && content_type != NULL)
1505 apr_status_t lockrslt = apr_global_mutex_lock(sp_cache->fs_cache_lock);
1506 if (lockrslt != APR_SUCCESS)
1508 ap_log_error(APLOG_MARK, APLOG_WARNING, lockrslt, r->server, "Can't obtain the lock");
1509 return;
1511 RMM_OFF_T(vary_headers_t) vary_headers;
1512 int addrslt = make_vary_headers(r->pool, r->server, sp_cache, r->headers_in, r->headers_out, &vary_headers);
1513 if (addrslt != 0)
1515 ap_log_error(APLOG_MARK, APLOG_WARNING, APR_SUCCESS, r->server,
1516 "Could not allocate memory to cache vary headers");
1518 else
1520 addrslt = add_cached_page(sp_cache, crcsync_similar, hostname, basepath, dobj->name, content_type, vary_headers);
1521 if (addrslt == 0)
1523 ap_log_error(APLOG_MARK, APLOG_DEBUG, APR_SUCCESS, r->server,
1524 "Successfully added file %s to 'find similar page' cache (host: %s, content-type: %s, regex: %s, uri: %s)",
1525 basepath, hostname, content_type, crcsync_similar, dobj->name);
1527 else
1529 ap_log_error(APLOG_MARK, APLOG_WARNING, APR_SUCCESS, r->server,
1530 "Failed to add file %s with regex %s for host %s, content-type %s, uri %s to 'find similar page' cache, result: %d",
1531 basepath, crcsync_similar, hostname, content_type, dobj->name, addrslt);
1534 lockrslt = apr_global_mutex_unlock(sp_cache->fs_cache_lock);
1535 if (lockrslt != APR_SUCCESS)
1537 ap_log_error(APLOG_MARK, APLOG_WARNING, lockrslt, r->server, "Can't release the lock");