зеркало из
				https://github.com/iharh/notes.git
				synced 2025-11-04 07:36:08 +02:00 
			
		
		
		
	
		
			
				
	
	
		
			775 строки
		
	
	
		
			21 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			775 строки
		
	
	
		
			21 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
[special]
 | 
						|
;
 | 
						|
[nostats]
 | 
						|
;
 | 
						|
; Speech grammar rule table
 | 
						|
;
 | 
						|
[attrs]
 | 
						|
; c is a conjunction
 | 
						|
;
 | 
						|
; chs (currently used)
 | 
						|
p(1) + n = 5    ; chs-only
 | 
						|
n + f(1) = 300  ; both
 | 
						|
n + m(1) = 500  ; both
 | 
						|
n(1) + v = 100  ; both
 | 
						|
n + v(1) = 10   ; both
 | 
						|
r + n(1) = 1000 ; both
 | 
						|
r(1) + n = 100  ; both
 | 
						|
d(1) + r = 100  ; both
 | 
						|
d(1) + v = 100  ; both
 | 
						|
v(1) + r = 100  ; both
 | 
						|
n + m(1) = 500  ; both
 | 
						|
v + f(1) = 30   ; both
 | 
						|
v(1) + m = 100  ; both
 | 
						|
v(1) + n = 3    ; =3 for chs, =5 for cht
 | 
						|
a + u(1) = 5    ; both
 | 
						|
v + n(1) = 5    ; both
 | 
						|
u(1) + a = 2    ; chs-only
 | 
						|
ns + *(1) = 10  ; chs-only
 | 
						|
c(1) + * = 50   ; both
 | 
						|
* + c(1) = 50   ; both
 | 
						|
;
 | 
						|
; cht (traditional)
 | 
						|
v(1) + n = 5    ; cht-only
 | 
						|
 | 
						|
 | 
						|
// config.h
 | 
						|
 | 
						|
#define SCWS_ATTR_LEN 20
 | 
						|
 | 
						|
// xdict.h
 | 
						|
 | 
						|
/* data structure for word(12bytes) */
 | 
						|
typedef struct scws_word
 | 
						|
{
 | 
						|
    float tf;
 | 
						|
    float idf;
 | 
						|
    unsigned char flag;
 | 
						|
    char attr[SCWS_ATTR_LEN + 1]; // POS - https://github.com/hightman/scws/blob/master/API.md
 | 
						|
}   word_st, *word_t;
 | 
						|
 | 
						|
 | 
						|
// rule.h
 | 
						|
 | 
						|
#define SCWS_RULE_MAX           32
 | 
						|
#define SCWS_RULE_SPECIAL       0x80000000
 | 
						|
#define SCWS_RULE_NOSTATS       0x40000000
 | 
						|
 | 
						|
/* flag: 0x00 ~ 0x4000 */
 | 
						|
#define SCWS_ZRULE_NONE         0x00
 | 
						|
#define SCWS_ZRULE_PREFIX       0x01
 | 
						|
#define SCWS_ZRULE_SUFFIX       0x02
 | 
						|
#define SCWS_ZRULE_INCLUDE      0x04    /* with include */
 | 
						|
#define SCWS_ZRULE_EXCLUDE      0x08    /* with exclude */
 | 
						|
#define SCWS_ZRULE_RANGE        0x10    /* with znum range */
 | 
						|
 | 
						|
/* data structure */
 | 
						|
typedef struct scws_rule_item            // section
 | 
						|
{
 | 
						|
    short flag;
 | 
						|
    char zmin;
 | 
						|
    char zmax;
 | 
						|
    char name[17];                       // section name
 | 
						|
    char attr[SCWS_ATTR_LEN + 1];        // default - "un"
 | 
						|
    float tf;                            // default - 5.0
 | 
						|
    float idf;                           // default - 3.5
 | 
						|
    unsigned int bit;   /* my bit  */
 | 
						|
    unsigned int inc;   /* include */
 | 
						|
    unsigned int exc;   /* exclude */
 | 
						|
}   *rule_item_t;
 | 
						|
 | 
						|
/* special attrs ratio list(single chain, 12bytes) */
 | 
						|
typedef struct scws_rule_attr *rule_attr_t;
 | 
						|
struct scws_rule_attr
 | 
						|
{
 | 
						|
    char attr1[SCWS_ATTR_LEN + 1];  // ? v1
 | 
						|
    char attr2[SCWS_ATTR_LEN + 1];  // ? v2
 | 
						|
    unsigned char npath[2];         // ?? 0xff - by default,  (1) -> 0, else -> 0xff
 | 
						|
        // used by rule.c 
 | 
						|
        //   - scws_rule_new, 
 | 
						|
        //   - int scws_rule_attr_ratio(rule_t r, const char *attr1, const char *attr2, const unsigned char *npath)
 | 
						|
        //     // get rule attr x
 | 
						|
        //   -- scws.c - _scws_mseg_zone, 
 | 
						|
        //   --- static void _scws_msegment(scws_t s, int end, int zlen)
 | 
						|
        //   ---- scws_res_t scws_get_result(scws_t s)
 | 
						|
 | 
						|
    short ratio;                    // ?   = <num>
 | 
						|
    rule_attr_t next;               // ptr to the next scws_rule_attr
 | 
						|
};
 | 
						|
 | 
						|
typedef struct scws_rule
 | 
						|
{
 | 
						|
    xtree_t tree;
 | 
						|
    rule_attr_t attr;               // [attrs] section: ptr to the first item of the attrs list 
 | 
						|
    struct scws_rule_item items[SCWS_RULE_MAX]; // SCWS_RULE_MAX is 32
 | 
						|
}   rule_st, *rule_t;
 | 
						|
 | 
						|
/* scws ruleset: api */
 | 
						|
 | 
						|
/* create & load ruleset, by fpath & charset */
 | 
						|
rule_t scws_rule_new(const char *fpath, unsigned char *mblen);
 | 
						|
 | 
						|
/* free the memory & resource for ruleset */
 | 
						|
void scws_rule_free(rule_t r);
 | 
						|
 | 
						|
/* get the rule tree record by str */
 | 
						|
rule_item_t scws_rule_get(rule_t r, const char *str, int len);
 | 
						|
 | 
						|
/* check bit */
 | 
						|
int scws_rule_checkbit(rule_t r, const char *str, int len, unsigned int bit);
 | 
						|
 | 
						|
/* get rule attr x */
 | 
						|
int scws_rule_attr_ratio(rule_t r, const char *attr1, const char *attr2, const unsigned char *npath);
 | 
						|
 | 
						|
/* check exclude or include */
 | 
						|
int scws_rule_check(rule_t r, rule_item_t cr, const char *str, int len);
 | 
						|
 | 
						|
/* Return the index in r->items of the section called `name` (case-insensitive), or -1 if there is none. */
static inline int _rule_index_get(rule_t r, const char *name)
{
    int idx = 0;

    /* the scan ends at the first slot whose name is empty */
    while (idx < SCWS_RULE_MAX && r->items[idx].name[0] != '\0')
    {
        if (strcasecmp(r->items[idx].name, name) == 0)
            return idx;
        idx++;
    }
    return -1;
}
 | 
						|
 | 
						|
rule_t scws_rule_new(const char *fpath, unsigned char *mblen)
 | 
						|
{
 | 
						|
    FILE *fp;
 | 
						|
    rule_t r;
 | 
						|
    rule_item_t cr;       // cr - current rule item
 | 
						|
    int i, j, rbl, aflag; // rbl - read by line, aflag - attrs-section parsing,
 | 
						|
    rule_attr_t a,rtail;
 | 
						|
    unsigned char buf[512], *str, *ptr, *qtr;
 | 
						|
 | 
						|
    /* loaded or open file failed */
 | 
						|
    if ((fp = fopen(fpath, "r")) == NULL)
 | 
						|
        return NULL;
 | 
						|
 | 
						|
    /* alloc the memory */
 | 
						|
    r = (rule_t) malloc(sizeof(rule_st));
 | 
						|
    memset(r, 0, sizeof(rule_st));
 | 
						|
 | 
						|
    /* quick scan to add the name to list */
 | 
						|
    i = j = rbl = aflag = 0;
 | 
						|
    while (fgets(buf, sizeof(buf)-1, fp))
 | 
						|
    {
 | 
						|
        if (buf[0] != '[' || !(ptr = strchr(buf, ']'))) // we are interested in [sections] only
 | 
						|
            continue;
 | 
						|
 | 
						|
        str = buf + 1;
 | 
						|
        *ptr = '\0';
 | 
						|
        if (ptr == str || (ptr-str) > 15 || !strcasecmp(str, "attrs")) // skip [attrs] section
 | 
						|
            continue;
 | 
						|
 | 
						|
        if (_rule_index_get(r, str) >= 0) // already in the items list
 | 
						|
            continue;
 | 
						|
 | 
						|
        // !!! i is the rule index here from 0 to SCWS_RULE_MAX (== 32)
 | 
						|
 | 
						|
        strcpy(r->items[i].name, str);
 | 
						|
        r->items[i].tf = 5.0;                // default tf is  5.0
 | 
						|
        r->items[i].idf = 3.5;               // default idf is 3.5
 | 
						|
        strncpy(r->items[i].attr, "un", 2);  // default attr is "un"
 | 
						|
 | 
						|
        // special and nostats are just a plain words
 | 
						|
 | 
						|
        if (!strcasecmp(str, "special"))
 | 
						|
            r->items[i].bit = SCWS_RULE_SPECIAL;
 | 
						|
        else if (!strcasecmp(str, "nostats"))
 | 
						|
            r->items[i].bit = SCWS_RULE_NOSTATS;
 | 
						|
        else
 | 
						|
        {
 | 
						|
            r->items[i].bit = (1<<j);
 | 
						|
            j++;
 | 
						|
        }
 | 
						|
 | 
						|
        if (++i >= SCWS_RULE_MAX)  // all section has already been read
 | 
						|
            break;
 | 
						|
    }
 | 
						|
 | 
						|
    // 2-nd pass
 | 
						|
    rewind(fp);
 | 
						|
 | 
						|
    /* load the tree data */
 | 
						|
    if ((r->tree = xtree_new(0, 1)) == NULL)
 | 
						|
    {
 | 
						|
        free(r);
 | 
						|
        return NULL;
 | 
						|
    }
 | 
						|
    cr = NULL;
 | 
						|
    while (fgets(buf, sizeof(buf)-1, fp))
 | 
						|
    {
 | 
						|
        if (buf[0] == ';') // comment
 | 
						|
            continue;
 | 
						|
 | 
						|
        if (buf[0] == '[')
 | 
						|
        {
 | 
						|
            cr = NULL;
 | 
						|
            str = buf + 1;
 | 
						|
            aflag = 0;
 | 
						|
            if ((ptr = strchr(str, ']')) != NULL)
 | 
						|
            {
 | 
						|
                *ptr = '\0';
 | 
						|
                if (!strcasecmp(str, "attrs"))
 | 
						|
                {
 | 
						|
                    aflag = 1;
 | 
						|
                }
 | 
						|
                // all the non [attrs] section - need to be read by line
 | 
						|
                else if ((i = _rule_index_get(r, str)) >= 0)  
 | 
						|
                {
 | 
						|
                    rbl = 1;    /* default read by line = yes */
 | 
						|
                    cr = &r->items[i];
 | 
						|
                }
 | 
						|
            }
 | 
						|
            continue;
 | 
						|
        }
 | 
						|
 | 
						|
        /* attr flag open? */
 | 
						|
        if (aflag == 1)                               // [attrs] section
 | 
						|
        {
 | 
						|
            /* parse the attr line */
 | 
						|
 | 
						|
	    // str - first part (before +)
 | 
						|
	    // ptr - second     (after + before =)
 | 
						|
	    // qtr - third      (after =)
 | 
						|
            
 | 
						|
            str = buf;
 | 
						|
            while (*str == ' ' || *str == '\t') str++;
 | 
						|
            if ((ptr = strchr(str, '+')) == NULL) continue;
 | 
						|
            *ptr++ = '\0';
 | 
						|
            if ((qtr = strchr(ptr, '=')) == NULL) continue;
 | 
						|
            *qtr++ = '\0';
 | 
						|
 | 
						|
            /* create new memory */
 | 
						|
            a = (rule_attr_t) malloc(sizeof(struct scws_rule_attr));
 | 
						|
            memset(a, 0, sizeof(struct scws_rule_attr));
 | 
						|
 | 
						|
            /* get ratio */
 | 
						|
            while(*qtr == ' ' || *qtr == '\t') qtr++;
 | 
						|
            a->ratio = (short) atoi(qtr);
 | 
						|
            if (a->ratio < 1)
 | 
						|
                a->ratio = 1;
 | 
						|
            a->npath[0] = a->npath[1] = 0xff;
 | 
						|
 | 
						|
            /* read attr1 & npath1? */
 | 
						|
            a->attr1[0] = *str++;
 | 
						|
            if (*str && *str != '(' && *str != ' ' && *str != '\t')
 | 
						|
                a->attr1[1] = *str++;
 | 
						|
            while (*str && *str != '(') str++;
 | 
						|
            if (*str == '(')
 | 
						|
            {
 | 
						|
                str++;
 | 
						|
                if ((qtr = strchr(str, ')')) != NULL)
 | 
						|
                {
 | 
						|
                    *qtr = '\0';
 | 
						|
                    a->npath[0] = (unsigned char) atoi(str);
 | 
						|
                    if (a->npath[0] > 0)
 | 
						|
                        a->npath[0]--;
 | 
						|
                    else
 | 
						|
                        a->npath[0] = 0xff;
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            /* read attr2 & npath2? */
 | 
						|
            str = ptr;
 | 
						|
            while (*str == ' ' || *str == '\t') str++;
 | 
						|
            a->attr2[0] = *str++;
 | 
						|
            if (*str && *str != '(' && *str != ' ' && *str != '\t')
 | 
						|
                a->attr2[1] = *str++;
 | 
						|
            while (*str && *str != '(') str++;
 | 
						|
            if (*str == '(')
 | 
						|
            {
 | 
						|
                str++;
 | 
						|
                if ((qtr = strchr(str, ')')) != NULL)
 | 
						|
                {
 | 
						|
                    *qtr = '\0';
 | 
						|
                    a->npath[1] = (unsigned char) atoi(str);
 | 
						|
                    if (a->npath[1] > 0)
 | 
						|
                        a->npath[1]--;
 | 
						|
                    else
 | 
						|
                        a->npath[1] = 0xff;
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            //printf("%c%c(%d)+%c%c(%d)=%d\n", a->attr1[0], a->attr1[1] ? a->attr1[1] : ' ', a->npath[0],
 | 
						|
            //  a->attr2[0], a->attr2[1] ? a->attr2[1] : ' ', a->npath[1], a->ratio);
 | 
						|
 | 
						|
            /* append to the chain list */
 | 
						|
            if (r->attr == NULL)
 | 
						|
                r->attr = rtail = a;
 | 
						|
            else
 | 
						|
            {
 | 
						|
                rtail->next = a;
 | 
						|
                rtail = a;
 | 
						|
            }
 | 
						|
 | 
						|
            continue;
 | 
						|
        }
 | 
						|
 | 
						|
        if (cr == NULL) // cr == NULL for [attrs] section
 | 
						|
            continue;
 | 
						|
 | 
						|
        /* param set: line|znum|include|exclude|type|tf|idf|attr */
 | 
						|
        if (buf[0] == ':')
 | 
						|
        {
 | 
						|
            str = buf + 1;
 | 
						|
            if (!(ptr = strchr(str, '=')))
 | 
						|
                continue;
 | 
						|
            while (*str == ' ' || *str == '\t') str++;
 | 
						|
 | 
						|
            qtr = ptr + 1;
 | 
						|
            while (ptr > str && (ptr[-1] == ' ' || ptr[-1] == '\t')) ptr--;
 | 
						|
            *ptr = '\0';
 | 
						|
            ptr = str;
 | 
						|
            str = qtr;
 | 
						|
            while (*str == ' ' || *str == '\t') str++;
 | 
						|
 | 
						|
            if (!strcmp(ptr, "line"))
 | 
						|
                rbl =  (*str == 'N' || *str == 'n') ? 0 : 1;
 | 
						|
            else if (!strcmp(ptr, "tf"))
 | 
						|
                cr->tf = (float) atof(str);
 | 
						|
            else if (!strcmp(ptr, "idf"))
 | 
						|
                cr->idf = (float) atof(str);
 | 
						|
            else if (!strcmp(ptr, "attr"))
 | 
						|
                strncpy(cr->attr, str, 2);
 | 
						|
            else if (!strcmp(ptr, "znum"))
 | 
						|
            {
 | 
						|
                if ((ptr = strchr(str, ',')) != NULL)
 | 
						|
                {
 | 
						|
                    *ptr++ = '\0';
 | 
						|
                    while (*ptr == ' ' || *ptr == '\t') ptr++;
 | 
						|
                    cr->zmax = atoi(ptr);
 | 
						|
                    cr->flag |= SCWS_ZRULE_RANGE;
 | 
						|
                }
 | 
						|
                cr->zmin = atoi(str);
 | 
						|
            }
 | 
						|
            else if (!strcmp(ptr, "type"))
 | 
						|
            {
 | 
						|
                if (!strncmp(str, "prefix", 6))
 | 
						|
                    cr->flag |= SCWS_ZRULE_PREFIX;
 | 
						|
                else if (!strncmp(str, "suffix", 6))
 | 
						|
                    cr->flag |= SCWS_ZRULE_SUFFIX;
 | 
						|
            }
 | 
						|
            else if (!strcmp(ptr, "include") || !strcmp(ptr, "exclude"))
 | 
						|
            {
 | 
						|
                unsigned int *clude;
 | 
						|
 | 
						|
                if (!strcmp(ptr, "include"))
 | 
						|
                {
 | 
						|
                    clude = &cr->inc;
 | 
						|
                    cr->flag |= SCWS_ZRULE_INCLUDE;
 | 
						|
                }
 | 
						|
                else
 | 
						|
                {
 | 
						|
                    clude = &cr->exc;
 | 
						|
                    cr->flag |= SCWS_ZRULE_EXCLUDE;
 | 
						|
                }
 | 
						|
 | 
						|
                while ((ptr = strchr(str, ',')) != NULL)
 | 
						|
                {
 | 
						|
                    while (ptr > str && (ptr[-1] == '\t' || ptr[-1] == ' ')) ptr--;
 | 
						|
                    *ptr = '\0';
 | 
						|
                    if ((i = _rule_index_get(r, str)) >= 0)
 | 
						|
                        *clude |= r->items[i].bit;
 | 
						|
 | 
						|
                    str = ptr + 1;
 | 
						|
                    while (*str == ' ' || *str == '\t' || *str == ',') str++;
 | 
						|
                }
 | 
						|
 | 
						|
                ptr = strlen(str) + str;
 | 
						|
                while (ptr > str && strchr(" \t\r\n", ptr[-1])) ptr--;
 | 
						|
                *ptr = '\0';
 | 
						|
                if (ptr > str && (i = _rule_index_get(r, str)))
 | 
						|
                    *clude |= r->items[i].bit;
 | 
						|
            }
 | 
						|
            continue;
 | 
						|
        }
 | 
						|
 | 
						|
        /* read the entries */
 | 
						|
        str = buf;
 | 
						|
        while (*str == ' ' || *str == '\t') str++;
 | 
						|
        ptr = str + strlen(str);
 | 
						|
        while (ptr > str && strchr(" \t\r\n", ptr[-1])) ptr--;
 | 
						|
        *ptr = '\0';
 | 
						|
 | 
						|
        /* emptry line */
 | 
						|
        if (ptr == str)
 | 
						|
            continue;
 | 
						|
 | 
						|
        if (rbl) // if read-by-line - xtree_nput(r->tree, cr, ..., str, ptr -str)
 | 
						|
            xtree_nput(r->tree, cr, sizeof(struct scws_rule_item), str, ptr - str);
 | 
						|
        else
 | 
						|
        {
 | 
						|
            while (str < ptr)
 | 
						|
            {
 | 
						|
                j = mblen[(*str)];
 | 
						|
 | 
						|
#ifdef HAVE_NOT_QUIET
 | 
						|
                /* try to check repeat */
 | 
						|
                if ((i = (int) xtree_nget(r->tree, str, j, NULL)) != 0)
 | 
						|
                    fprintf(stderr, "Reapeat word on %s|%s: %.*s\n", cr->name, ((rule_item_t) i)->name, j, str);
 | 
						|
#endif
 | 
						|
 | 
						|
                xtree_nput(r->tree, cr, sizeof(struct scws_rule_item), str, j);
 | 
						|
                str += j;
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
    fclose(fp);
 | 
						|
 | 
						|
    /* optimize the tree */
 | 
						|
    xtree_optimize(r->tree);
 | 
						|
    return r;
 | 
						|
}
 | 
						|
 | 
						|
/* get rule attr x */
 | 
						|
/* True when rule attribute y is the wildcard "*" or is equal to x. Arguments are parenthesized so expressions may be passed. */
#define EQUAL_RULE_ATTR(x,y)    (((y)[0]=='*'&&(y)[1]=='\0') || (strcmp((x),(y))==0))
/* True when each of the two rule bytes in y is 0xff (wildcard) or equals the matching byte of x. */
#define EQUAL_RULE_NPATH(x,y)   (((y)[0]==0xff||(y)[0]==(x)[0])&&((y)[1]==0xff||(y)[1]==(x)[1]))
 | 
						|
 | 
						|
// special attrs ratio list(single chain, 12bytes)
 | 
						|
//typedef struct scws_rule_attr *rule_attr_t;
 | 
						|
//struct scws_rule_attr
 | 
						|
//{
 | 
						|
//   char attr1[SCWS_ATTR_LEN + 1];  // v1
 | 
						|
//   char attr2[SCWS_ATTR_LEN + 1];  // v2
 | 
						|
//   unsigned char npath[2];         // 0xff - by default,  (1) -> 0, else -> 0xff
 | 
						|
//   short ratio;                    // <num-ratio>
 | 
						|
//   rule_attr_t next;               // ptr to the next scws_rule_attr
 | 
						|
//};
 | 
						|
 | 
						|
//typedef struct scws_rule {
 | 
						|
//   ...
 | 
						|
//   rule_attr_t attr;     // [attrs] section: ptr to the first item of the attrs list 
 | 
						|
//   ...
 | 
						|
//}  rule_st, *rule_t;
 | 
						|
 | 
						|
int
 | 
						|
scws_rule_attr_ratio(rule_t r, const char *attr1, const char *attr2, const unsigned char *npath)
 | 
						|
{
 | 
						|
    rule_attr_t a;
 | 
						|
    int ret = 1;
 | 
						|
 | 
						|
    if (!r || (a = r->attr) == NULL)
 | 
						|
        return ret; // 1 - if there are no rules or [attrs] section is missing
 | 
						|
    
 | 
						|
    // iterate through the [attrs] section items
 | 
						|
    while (a != NULL)
 | 
						|
    {
 | 
						|
        if (EQUAL_RULE_ATTR(attr1, a->attr1) && EQUAL_RULE_ATTR(attr2, a->attr2) && EQUAL_RULE_NPATH(npath, a->npath))
 | 
						|
        {
 | 
						|
            ret = (int) a->ratio; // found the given attr1/attr2/npath (rule attr1/attr2 could be '*')
 | 
						|
            break;                //   rule npath could be absent (0xff), which is just like a '*'
 | 
						|
        }
 | 
						|
        a = a->next;
 | 
						|
    }
 | 
						|
    return ret; // 1 - if the given attr1/attr2/npath not found
 | 
						|
}
 | 
						|
 | 
						|
#undef EQUAL_RULE_ATTR
 | 
						|
#undef EQUAL_RULE_NPATH
 | 
						|
 | 
						|
 | 
						|
// scws.h
 | 
						|
 | 
						|
/* data structures */
 | 
						|
typedef struct scws_result *scws_res_t;
 | 
						|
struct scws_result
 | 
						|
{
 | 
						|
    int off;                      // src text offset
 | 
						|
    float idf;
 | 
						|
    unsigned char len;            // src text length
 | 
						|
    char attr[SCWS_ATTR_LEN + 1]; // SCWS_ATTR_LEN == 20
 | 
						|
    scws_res_t next;
 | 
						|
};
 | 
						|
 | 
						|
// ...
 | 
						|
 | 
						|
typedef struct
 | 
						|
{
 | 
						|
    xdict_t d;
 | 
						|
    rule_t r;                  // rule
 | 
						|
    unsigned char *mblen;      // ???
 | 
						|
    unsigned int mode;
 | 
						|
    unsigned char *txt;
 | 
						|
    int zis;
 | 
						|
    int len;
 | 
						|
    int off;
 | 
						|
    int wend;
 | 
						|
    scws_res_t res0;
 | 
						|
    scws_res_t res1;
 | 
						|
    word_t **wmap;
 | 
						|
    struct scws_zchar *zmap;
 | 
						|
}   scws_st, *scws_t;
 | 
						|
 | 
						|
// scws.cpp
 | 
						|
 | 
						|
/* Replace the rule set of s with one loaded from fpath; any previous rule set is freed first. */
void scws_set_rule(scws_t s, const char *fpath)
{
    rule_t previous = s->r;

    if (previous != NULL)
        scws_rule_free(previous);

    /* s->r becomes NULL when loading fails */
    s->r = scws_rule_new(fpath, s->mblen);
}
 | 
						|
 | 
						|
 | 
						|
/* multibyte segment */
 | 
						|
...
 | 
						|
 | 
						|
/*
 * Pick the best way to cut positions f..t into words and commit it.
 *
 * For every candidate word i..j (from _scws_mget_word) a path covering f..t
 * is built in npath: one byte per word holding (end - start), terminated by
 * 0xff. Its weight is the product of each word's tf, (end - start)^4 for
 * multi-position words and the [attrs] ratio of each adjacent attribute pair,
 * divided by (words + single-position words - 1)^5. The heaviest path is kept
 * in mpath and finally applied with _scws_mset_word().
 */
static void _scws_mseg_zone(scws_t s, int f, int t)
{
    unsigned char *mpath, *npath;
    word_t **wmap;
    int x,i,j,m,n,j2,sz;
    double weight, nweight;
    /* +1 so the copy stays NUL-terminated for strcmp() inside scws_rule_attr_ratio() */
    char attr1[SCWS_ATTR_LEN + 1];

    mpath = npath = NULL;
    weight = nweight = (double) 0.0;

    wmap = s->wmap;
    j2 = 0;
    for (x = i = f; i <= t; i++)
    {
        j = _scws_mget_word(s, i, (x > i ? x - 1 : t));
        if (j == i) continue;
        // skip NR in NR
        if (j < j2 && strcmp(wmap[i][j]->attr, attr_nr) == 0) continue;
        if (i > j2 && (wmap[i][j]->flag & SCWS_WORD_USED)) continue;

        /* one word only */
        if (i == f && j == t)
        {
            mpath = (unsigned char *) malloc(2);
            if (mpath != NULL)
            {
                mpath[0] = j - i;
                mpath[1] = 0xff;
            }
            break;
        }

        if (i != f && (wmap[i][j]->flag & SCWS_WORD_RULE))
            continue;

        /* create the new path */
        wmap[i][j]->flag |= SCWS_WORD_USED;
        nweight = (double) wmap[i][j]->tf * pow(j-i,4);

        if (npath == NULL)
        {
            npath = (unsigned char *) malloc(t-f+2);
            if (npath == NULL)
                break;              /* out of memory: fall through with whatever mpath holds */
            memset(npath, 0xff, t-f+2);
        }

        /* lookfor backward */
        x = sz = 0;
        memset(attr1, 0, sizeof(attr1));
        for (m = f; m < i; m = n+1)
        {
            n = _scws_mget_word(s, m, i-1);
            nweight *= wmap[m][n]->tf;
            npath[x++] = n - m;
            if (n > m)
            {
                nweight *= pow(n-m,4);
                wmap[m][n]->flag |= SCWS_WORD_USED;
            }
            else sz++;

            /* attr1 is only non-empty from the second word on, so x >= 2 here */
            if (attr1[0] != '\0')
                nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[m][n]->attr, &npath[x-2]);
            memcpy(attr1, wmap[m][n]->attr, SCWS_ATTR_LEN);
        }

        /* my self */
        npath[x++] = j - i;

        if (attr1[0] != '\0')
            nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[i][j]->attr, &npath[x-2]);
        memcpy(attr1, wmap[i][j]->attr, SCWS_ATTR_LEN);

        /* lookfor forward */
        for (m = j+1; m <= t; m = n+1)
        {
            n = _scws_mget_word(s, m, t);
            nweight *= wmap[m][n]->tf;
            npath[x++] = n - m;
            if (n > m)
            {
                nweight *= pow(n-m,4);
                wmap[m][n]->flag |= SCWS_WORD_USED;
            }
            else sz++;

            nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[m][n]->attr, &npath[x-2]);
            memcpy(attr1, wmap[m][n]->attr, SCWS_ATTR_LEN);
        }

        npath[x] = 0xff;
        nweight /= pow(x+sz-1,5);

        /* draw the path for debug */
#ifdef HAVE_NOT_QUIET
        if (s->mode & SCWS_DEBUG)
        {
            fprintf(stderr, "PATH by keyword = %.*s, (weight=%.4f):\n",
                s->zmap[j].end - s->zmap[i].start, s->txt + s->zmap[i].start, nweight);
            for (x = 0, m = f; (n = npath[x]) != 0xff; x++)
            {
                n += m;
                fprintf(stderr, "%.*s ", s->zmap[n].end - s->zmap[m].start, s->txt + s->zmap[m].start);
                m = n + 1;
            }
            fprintf(stderr, "\n--\n");
        }
#endif

        j2 = x = j;
        if (x - i > 1) i--;
        /* check better path: the buffers are swapped so the loser is reused for the next candidate */
        if (nweight > weight)
        {
            unsigned char *swap;

            weight = nweight;
            swap = mpath;
            mpath = npath;
            npath = swap;
        }
    }

    /* set the result when a path was selected */
    if (mpath != NULL)
    {
        for (x = 0, m = f; (n = mpath[x]) != 0xff; x++)
        {
            n += m;
            _scws_mset_word(s, m, n);
            m = n + 1;
        }
    }

    /*
     * Release both buffers on every path; previously npath leaked when the
     * function returned early because no path had been selected.
     */
    free(mpath);
    free(npath);
}
 | 
						|
 | 
						|
 | 
						|
// xtree.h
 | 
						|
 | 
						|
/* pool required */
 | 
						|
#include "pool.h"
 | 
						|
 | 
						|
/* data structure for Hash+Tree */
 | 
						|
/* Node of one binary search tree inside an xtree. */
typedef struct tree_node node_st, *node_t;
struct tree_node
{
    char *key;      /* NUL-terminated copy of the key, made with pstrndup() in xtree_nput() */
    void *value;    /* caller's pointer, stored as given (not copied) */
    int vlen;       /* length the caller passed along with value */
    node_t left;    /* subtree of keys ordered before this one */
    node_t right;   /* subtree of keys ordered after this one */
};
 | 
						|
 | 
						|
typedef struct
 | 
						|
{
 | 
						|
    pool_t p;       /* pool for memory manager */
 | 
						|
    int base;       /* base number for hasher (prime number recommend) */
 | 
						|
    int prime;      /* good prime number for hasher */
 | 
						|
    int count;      /* total nodes */
 | 
						|
    node_t *trees;  /* trees [total=prime+1] */
 | 
						|
}   xtree_st, *xtree_t;
 | 
						|
 | 
						|
...
 | 
						|
 | 
						|
// xtree.cpp
 | 
						|
 | 
						|
/* private static functions */
 | 
						|
/* Hash the len bytes at s into a slot number below xt->prime; bytes are consumed from the last one to the first. */
static int _xtree_hasher(xtree_t xt, const char *s, int len)
{
    const unsigned char *cursor = (const unsigned char *) s + len;
    unsigned int hash = xt->base;

    for (; len != 0; len--)
    {
        cursor--;
        hash += (hash << 5);        /* hash * 33 */
        hash ^= *cursor;
        hash &= 0x7fffffff;         /* keep it within 31 bits */
    }
    return (hash % xt->prime);
}
 | 
						|
 | 
						|
/*
 * Search the tree rooted at head (must not be NULL) for the key of len bytes.
 *
 * Returns the matching node, or NULL when there is none; in the latter case,
 * if pnode is not NULL, *pnode receives the address of the empty child link
 * where a node for this key has to be attached.
 *
 * Keys are ordered by their bytes and, when one is a prefix of the other, by
 * length. strncmp() is used instead of memcmp() so that a stored key shorter
 * than len is never read beyond its terminating NUL.
 */
static node_t _xtree_node_search(node_t head, node_t **pnode, const char *key, int len)
{
    for (;;)
    {
        node_t *next;
        int cmp = strncmp(key, head->key, (size_t) len);

        if (cmp == 0)
        {
            /* same leading bytes: decide on the lengths, compared as size_t */
            size_t klen = strlen(head->key);

            if (klen == (size_t) len)
                return head;
            cmp = ((size_t) len > klen) ? 1 : -1;
        }

        next = (cmp > 0 ? &head->right : &head->left);
        if (*next == NULL)
        {
            if (pnode != NULL)
                *pnode = next;
            return NULL;
        }
        head = *next;
    }
}
 | 
						|
 | 
						|
/*
 * Find the node for key (len bytes) in xt.
 * Returns it, or NULL when absent; then, if pnode is not NULL, *pnode is set
 * to the link (slot root or child pointer) where the new node belongs.
 */
static node_t _xtree_node_find(xtree_t xt, node_t **pnode, const char *key, int len)
{
    int slot = 0;
    node_t root;

    /* without a usable modulus everything lives in slot 0 */
    if (xt->prime > 1)
        slot = _xtree_hasher(xt, key, len);

    root = xt->trees[slot];
    if (root != NULL)
        return _xtree_node_search(root, pnode, key, len);

    if (pnode != NULL)
        *pnode = &xt->trees[slot];
    return NULL;
}
 | 
						|
 | 
						|
/* public functions */
 | 
						|
 | 
						|
...
 | 
						|
 | 
						|
/*
 * Store value/vlen under the key of len bytes.
 * An existing node is overwritten (also with a NULL value); a new node is
 * only created when value is not NULL. The value pointer itself is stored,
 * the key is copied into the pool. Does nothing when xt or key is NULL or
 * len is 0.
 */
void xtree_nput(xtree_t xt, void *value, int vlen, const char *key, int len)
{
    node_t *link;
    node_t hit;

    if (xt == NULL || key == NULL || len == 0)
        return;

    hit = _xtree_node_find(xt, &link, key, len);
    if (hit == NULL)
    {
        if (value == NULL)
            return;

        /* attach a fresh leaf at the link reported by the lookup */
        hit = (node_t) pmalloc(xt->p, sizeof(node_st));
        *link = hit;
        hit->key = pstrndup(xt->p, key, len);
        hit->left = NULL;
        hit->right = NULL;
    }

    hit->value = value;
    hit->vlen = vlen;
}
 | 
						|
 | 
						|
/* String convenience wrapper around xtree_nput(): both lengths are taken with strlen(), a NULL value has length 0. */
void xtree_put(xtree_t xt, const char *value, const char *key)
{
    int vlen;

    if (xt == NULL || key == NULL)
        return;

    vlen = (value != NULL) ? strlen(value) : 0;
    xtree_nput(xt, (void *) value, vlen, key, strlen(key));
}
 | 
						|
 |