blob: cc0185270d483ee2e2577237bbd4127b4c24f5a5 [file] [log] [blame]
/*
* HTML Entity & Encoding normalization.
*
* Copyright (C) 2006-2007 Török Edvin <edwin@clamav.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*
*/
#include <clamav-config.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "cltypes.h"
#include "clamav.h"
#include "others.h"
#include "hashtab.h"
static const size_t prime_list[] =
{
53ul, 97ul, 193ul, 389ul, 769ul,
1543ul, 3079ul, 6151ul, 12289ul, 24593ul,
49157ul, 98317ul, 196613ul, 393241ul, 786433ul,
1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,
50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul,
1610612741ul, 3221225473ul
};
static const size_t prime_n = sizeof(prime_list)/sizeof(prime_list[0]);
static unsigned char DELETED_KEY[] = "";
static size_t get_nearest_capacity(const size_t capacity)
{
size_t i;
for(i=0 ;i < prime_n; i++) {
if (prime_list[i] > capacity)
return prime_list[i];
}
cli_errmsg("Requested hashtable size is too big!");
return prime_list[prime_n-1];
}
#ifdef PROFILE_HASHTABLE
/* I know, this is ugly, most of these functions get a const s, that gets its const-ness discarded,
* and then these functions modify something the compiler assumes is readonly.
* Please, never use PROFILE_HASHTABLE in production code, and in releases. Use it for development only!*/
static inline void PROFILE_INIT(struct hashtable *s)
{
memset(&s->PROFILE_STRUCT,0,sizeof(s->PROFILE_STRUCT));
}
static inline void PROFILE_CALC_HASH(struct hashtable *s)
{
s->PROFILE_STRUCT.calc_hash++;
}
static inline void PROFILE_FIND_ELEMENT(struct hashtable *s)
{
s->PROFILE_STRUCT.find_req++;
}
static inline void PROFILE_FIND_NOTFOUND(struct hashtable *s, size_t tries)
{
s->PROFILE_STRUCT.not_found++;
s->PROFILE_STRUCT.not_found_tries += tries;
}
static inline void PROFILE_FIND_FOUND(struct hashtable *s, size_t tries)
{
s->PROFILE_STRUCT.found++;
s->PROFILE_STRUCT.found_tries += tries;
}
static inline void PROFILE_HASH_EXHAUSTED(struct hashtable *s)
{
s->PROFILE_STRUCT.hash_exhausted++;
}
static inline void PROFILE_GROW_START(struct hashtable *s)
{
s->PROFILE_STRUCT.grow++;
}
static inline void PROFILE_GROW_FOUND(struct hashtable *s, size_t tries)
{
s->PROFILE_STRUCT.grow_found++;
s->PROFILE_STRUCT.grow_found_tries += tries;
}
static inline void PROFILE_GROW_DONE(struct hashtable *s)
{
}
static inline void PROFILE_DELETED_REUSE(struct hashtable *s, size_t tries)
{
s->PROFILE_STRUCT.deleted_reuse++;
s->PROFILE_STRUCT.deleted_tries += tries;
}
static inline void PROFILE_INSERT(struct hashtable *s, size_t tries)
{
s->PROFILE_STRUCT.inserts++;
s->PROFILE_STRUCT.insert_tries += tries;
}
static inline void PROFILE_DATA_UPDATE(struct hashtable *s, size_t tries)
{
s->PROFILE_STRUCT.update++;
s->PROFILE_STRUCT.update_tries += tries;
}
static inline void PROFILE_HASH_DELETE(struct hashtable *s)
{
s->PROFILE_STRUCT.deletes++;
}
static inline void PROFILE_HASH_CLEAR(struct hashtable *s)
{
s->PROFILE_STRUCT.clear++;
}
static inline void PROFILE_REPORT(const struct hashtable *s)
{
size_t lookups, queries, insert_tries, inserts;
cli_dbgmsg("--------Hashtable usage report for %p--------------\n",(const void*)s);
cli_dbgmsg("hash function calculations:%ld\n",s->PROFILE_STRUCT.calc_hash);
cli_dbgmsg("successfull finds/total searches: %ld/%ld; lookups: %ld\n", s->PROFILE_STRUCT.found, s->PROFILE_STRUCT.find_req, s->PROFILE_STRUCT.found_tries);
cli_dbgmsg("unsuccessfull finds/total searches: %ld/%ld; lookups: %ld\n", s->PROFILE_STRUCT.not_found, s->PROFILE_STRUCT.find_req , s->PROFILE_STRUCT.not_found_tries);
cli_dbgmsg("successfull finds during grow:%ld; lookups: %ld\n",s->PROFILE_STRUCT.grow_found, s->PROFILE_STRUCT.grow_found_tries);
lookups = s->PROFILE_STRUCT.found_tries + s->PROFILE_STRUCT.not_found_tries + s->PROFILE_STRUCT.grow_found_tries;
queries = s->PROFILE_STRUCT.find_req + s->PROFILE_STRUCT.grow_found;
cli_dbgmsg("Find Lookups/total queries: %ld/%ld = %3f\n", lookups, queries, lookups*1.0/queries);
insert_tries = s->PROFILE_STRUCT.insert_tries + s->PROFILE_STRUCT.update_tries + s->PROFILE_STRUCT.deleted_tries;
cli_dbgmsg("new item insert tries/new items: %ld/%ld\n", s->PROFILE_STRUCT.insert_tries, s->PROFILE_STRUCT.inserts);
cli_dbgmsg("update tries/updates: %ld/%ld\n", s->PROFILE_STRUCT.update_tries, s->PROFILE_STRUCT.update);
cli_dbgmsg("deleted item reuse tries/deleted&reused items: %ld/%ld\n", s->PROFILE_STRUCT.deleted_tries, s->PROFILE_STRUCT.deleted_reuse);
inserts = s->PROFILE_STRUCT.inserts + s->PROFILE_STRUCT.update + s->PROFILE_STRUCT.deleted_reuse;
cli_dbgmsg("Insert tries/total inserts: %ld/%ld = %3f\n", insert_tries, inserts, insert_tries*1.0/inserts);
cli_dbgmsg("Grows: %ld, Deletes : %ld, hashtable clears: %ld\n",s->PROFILE_STRUCT.grow,s->PROFILE_STRUCT.deletes, s->PROFILE_STRUCT.clear);
cli_dbgmsg("--------Report end-------------\n");
}
#else
#define PROFILE_INIT(s)
#define PROFILE_CALC_HASH(s)
#define PROFILE_FIND_ELEMENT(s)
#define PROFILE_FIND_NOTFOUND(s, tries)
#define PROFILE_FIND_FOUND(s, tries)
#define PROFILE_HASH_EXHAUSTED(s)
#define PROFILE_GROW_START(s)
#define PROFILE_GROW_FOUND(s, tries)
#define PROFILE_GROW_DONE(s)
#define PROFILE_DELETED_REUSE(s, tries)
#define PROFILE_INSERT(s, tries)
#define PROFILE_DATA_UPDATE(s, tries)
#define PROFILE_HASH_DELETE(s)
#define PROFILE_HASH_CLEAR(s)
#define PROFILE_REPORT(s)
#endif
int hashtab_init(struct hashtable *s,size_t capacity)
{
if(!s)
return CL_ENULLARG;
PROFILE_INIT(s);
capacity = get_nearest_capacity(capacity);
s->htable = cli_calloc(capacity,sizeof(*s->htable));
if(!s->htable)
return CL_EMEM;
s->capacity = capacity;
s->used = 0;
s->maxfill = 8*capacity/10;
return 0;
}
static size_t hash(const unsigned char* k,const size_t len,const size_t SIZE)
{
size_t Hash = 0;
size_t i;
for(i=len;i>0;i--)
Hash = ((Hash << 8) + k[i-1]) % SIZE;
return Hash;
}
/* if returned element has key==NULL, then key was not found in table */
struct element* hashtab_find(const struct hashtable *s,const unsigned char* key,const size_t len)
{
struct element* element;
size_t tries = 1;
size_t idx;
if(!s)
return NULL;
PROFILE_CALC_HASH(s);
PROFILE_FIND_ELEMENT(s);
idx = hash(key, len, s->capacity);
element = &s->htable[idx];
do {
if(!element->key) {
PROFILE_FIND_NOTFOUND(s, tries);
return NULL; /* element not found, place is empty*/
}
else if(element->key != DELETED_KEY && strncmp((const char*)key,(const char*)element->key,len)==0) {
PROFILE_FIND_FOUND(s, tries);
return element;/* found */
}
else {
idx = (idx + tries++) % s->capacity;
element = &s->htable[idx];
}
} while (tries <= s->capacity);
PROFILE_HASH_EXHAUSTED(s);
return NULL; /* not found */
}
static int hashtab_grow(struct hashtable *s)
{
const size_t new_capacity = get_nearest_capacity(s->capacity);
struct element* htable = cli_calloc(new_capacity, sizeof(*s->htable));
size_t i,idx, used = 0;
if(new_capacity == s->capacity || !htable)
return CL_EMEM;
PROFILE_GROW_START(s);
cli_dbgmsg("hashtab.c: Warning: growing open-addressing hashtables is slow. Either allocate more storage when initializing, or use other hashtable types!\n");
for(i=0; i < s->capacity;i++) {
if(s->htable[i].key && s->htable[i].key != DELETED_KEY) {
struct element* element;
size_t tries = 1;
PROFILE_CALC_HASH(s);
idx = hash(s->htable[i].key, strlen((const char*)s->htable[i].key), new_capacity);
element = &htable[idx];
while(element->key && tries <= new_capacity) {
idx = (idx + tries++) % new_capacity;
element = &htable[idx];
}
if(!element->key) {
/* copy element from old hashtable to new */
PROFILE_GROW_FOUND(s, tries);
*element = s->htable[i];
used++;
}
else {
cli_errmsg("hashtab.c: Impossible - unable to rehash table");
return CL_EMEM;/* this means we didn't find enough room for all elements in the new table, should never happen */
}
}
}
free(s->htable);
s->htable = htable;
s->used = used;
s->capacity = new_capacity;
s->maxfill = new_capacity*8/10;
cli_dbgmsg("Table %p size after grow:%ld\n",(void*)s,s->capacity);
PROFILE_GROW_DONE(s);
return CL_SUCCESS;
}
int hashtab_insert(struct hashtable *s,const unsigned char* key,const size_t len,const element_data data)
{
struct element* element;
struct element* deleted_element = NULL;
size_t tries = 1;
size_t idx;
if(!s)
return CL_ENULLARG;
do {
PROFILE_CALC_HASH(s);
idx = hash(key, len, s->capacity);
element = &s->htable[idx];
do {
if(!element->key) {
unsigned char* thekey;
/* element not found, place is empty, insert*/
if(deleted_element) {
/* reuse deleted elements*/
element = deleted_element;
PROFILE_DELETED_REUSE(s, tries);
}
else {
PROFILE_INSERT(s, tries);
}
thekey = cli_malloc(len+1);
if(!thekey)
return CL_EMEM;
strncpy((char*)thekey,(const char*)key,len+1);
element->key = thekey;
element->data = data;
s->used++;
if(s->used > s->maxfill) {
cli_dbgmsg("hashtab.c:Growing hashtable %p, because it has exceeded maxfill, old size:%ld\n",(void*)s,s->capacity);
hashtab_grow(s);
}
return 0;
}
else if(element->key == DELETED_KEY) {
deleted_element = element;
}
else if(strncmp((const char*)key,(const char*)element->key,len)==0) {
PROFILE_DATA_UPDATE(s, tries);
element->data = data;/* key found, update */
return 0;
}
else {
idx = (idx + tries++) % s->capacity;
element = &s->htable[idx];
}
} while (tries <= s->capacity);
/* no free place found*/
PROFILE_HASH_EXHAUSTED(s);
cli_dbgmsg("hashtab.c: Growing hashtable %p, because its full, old size:%ld.\n",(void*)s,s->capacity);
} while( hashtab_grow(s) >= 0 );
cli_warnmsg("hashtab.c: Unable to grow hashtable\n");
return CL_EMEM;
}
void hashtab_delete(struct hashtable *s,const unsigned char* key,const size_t len)
{
struct element* e = hashtab_find(s,key,len);
if(e && e->key) {
PROFILE_HASH_DELETE(s);
free(e->key);/*FIXME: any way to shut up warnings here? if I make key char*, I get tons of warnings in entitylist.h */
e->key = DELETED_KEY;
s->used--;
}
}
void hashtab_clear(struct hashtable *s)
{
size_t i;
PROFILE_HASH_CLEAR(s);
for(i=0;i < s->capacity;i++) {
if(s->htable[i].key && s->htable[i].key != DELETED_KEY)
free(s->htable[i].key);/*FIXME: shut up warnings */
}
memset(s->htable, 0, s->capacity);
s->used = 0;
}
int hashtab_store(const struct hashtable *s,FILE* out)
{
size_t i;
for(i=0; i < s->capacity; i++) {
const struct element* e = &s->htable[i];
if(e->key && e->key != DELETED_KEY) {
fprintf(out,"%ld %s\n",e->data,e->key);
}
}
return CL_SUCCESS;
}
int hashtab_generate_c(const struct hashtable *s,const char* name)
{
size_t i;
printf("/* TODO: include GPL headers */\n");
printf("#include <hashtab.h>\n");
printf("static struct element %s_elements[] = {\n",name);
for(i=0; i < s->capacity; i++) {
const struct element* e = &s->htable[i];
if(!e->key)
printf("\t{NULL, 0},\n");
else if(e->key == DELETED_KEY)
printf("\t{DELETED_KEY,0},\n");
else
printf("\t{(const unsigned char*)\"%s\", %ld},\n", e->key, e->data);
}
printf("};\n");
printf("const struct hashtable %s = {\n",name);
printf("\t%s_elements, %ld, %ld, %ld", name, s->capacity, s->used, s->maxfill);
printf("\n};\n");
PROFILE_REPORT(s);
return 0;
}
int hashtab_load(FILE* in, struct hashtable *s)
{
char line[1024];
while (fgets(line, sizeof(line), in)) {
unsigned char l[1024];
int val;
sscanf(line,"%d %1023s",&val,l);
hashtab_insert(s,l,strlen((const char*)l),val);
}
return CL_SUCCESS;
}