notes/science/nlp/io.txt
Ihar Hancharenka 5dff80e88e first
2023-03-27 16:52:17 +03:00

279 строки
8.2 KiB
Plaintext

// io.c
/*
 * Open a (possibly multi-network) file for sequential reading.
 *
 * Creates a buffer handle with io_init() and loads filename into it with
 * io_gz_file_to_mem(). Returns the handle as an opaque
 * fsm_read_binary_handle, or NULL (after releasing the handle) when the
 * loader reports 0.
 */
fsm_read_binary_handle fsm_read_binary_file_multiple_init(char *filename) {
    struct io_buf_handle *handle = io_init();

    if (io_gz_file_to_mem(handle, filename) != 0) {
        /* Hand the buffer handle back to the caller as an opaque pointer. */
        return (void *) handle;
    }

    io_free(handle);
    return NULL;
}
/*
 * Read the next network through a handle returned by
 * fsm_read_binary_file_multiple_init().
 *
 * Returns the network produced by io_net_read(), or NULL when io_net_read()
 * returns NULL. In the NULL case the handle is released with io_free() and
 * must not be passed to this function again.
 */
struct fsm *fsm_read_binary_file_multiple(fsm_read_binary_handle fsrh) {
    char *net_name = NULL;
    struct fsm *net;
    struct io_buf_handle *iobh;

    iobh = (struct io_buf_handle *) fsrh;
    net = io_net_read(iobh, &net_name);

    /* The name is never handed to our caller, so release it on every path:
       io_net_read() may have stored it before hitting a later format error. */
    if (net_name != NULL) {
        xxfree(net_name);
    }

    if (net == NULL) {
        io_free(iobh);
        return NULL;
    }
    return net;
}
/* The file format we use is an extremely simple text format */
/* which is gzip compressed through libz and consists of the following sections: */
/* ##foma-net VERSION##*/
/* ##props## */
/* PROPERTIES LINE */
/* ##sigma## */
/* ...SIGMA LINES... */
/* ##states## */
/* ...TRANSITION LINES... */
/* ##end## */
/* Several networks may be concatenated in one file */
/* The initial identifier is "##foma-net 1.0##" */
/* where 1.0 is the version number for the file format */
/* followed by the line "##props##" */
/* which is followed by a line of space separated integers */
/* which correspond to: */
/* arity arccount statecount linecount finalcount pathcount is_deterministic */
/* is_pruned is_minimized is_epsilon_free is_loop_free extras name */
/* where extras packs is_completed (bits 0-1), arcs_sorted_in (bits 2-3) and */
/* arcs_sorted_out (bits 4-5), and name is used if defined networks are saved/loaded */
/* Following the props line, we accept anything (for future expansion) */
/* until we find ##sigma## */
/* the section beginning with "##sigma##" consists of lines with two fields: */
/* number string */
/* corresponding to the symbol number and the symbol string */
/* the section beginning with "##states##" consists of lines of ASCII integers */
/* with 2-5 fields to avoid some redundancy in every line corresponding to a */
/* transition where otherwise state numbers would be unnecessarily repeated and */
/* out symbols also (if in = out as is the case for recognizers/simple automata) */
/* The information depending on the number of fields in the lines is as follows: */
/* 2: in target (here state_no is the same as the last mentioned one and out = in) */
/* 3: in out target (again, state_no is the same as the last mentioned one) */
/* 4: state_no in target final_state (where out = in) */
/* 5: state_no in out target final_state */
/* There is no harm in always using 5 fields; however this will take up more space */
/* As in struct fsm_state, states without transitions are represented as a 4-field: */
/* state_no -1 -1 final_state (since in=out for 4-field lines, out = -1 as well) */
/* As gzopen will read uncompressed files as well, one can gunzip a file */
/* that contains a network and still read it */
struct fsm *io_net_read(struct io_buf_handle *iobh, char **net_name) {
char buf[READ_BUF_SIZE];
struct fsm *net;
struct fsm_state *fsm;
char *new_symbol;
int i, items, new_symbol_number, laststate, lineint[5], *cm;
int extras;
char last_final = '1';
if (io_gets(iobh, buf) == 0) {
return NULL;
}
net = fsm_create("");
if (strcmp(buf, "##foma-net 1.0##") != 0) {
fsm_destroy(net);
perror("File format error foma!\n");
return NULL;
}
io_gets(iobh, buf);
if (strcmp(buf, "##props##") != 0) {
perror("File format error props!\n");
fsm_destroy(net);
return NULL;
}
/* Properties */
io_gets(iobh, buf);
extras = 0;
sscanf(buf, "%i %i %i %i %i %lld %i %i %i %i %i %i %s", &net->arity, &net->arccount, &net->statecount, &net->linecount, &net->finalcount, &net->pathcount, &net->is_deterministic, &net->is_pruned, &net->is_minimized, &net->is_epsilon_free, &net->is_loop_free, &extras, buf);
strcpy(net->name, buf);
*net_name = xxstrdup(buf);
io_gets(iobh, buf);
net->is_completed = (extras & 3);
net->arcs_sorted_in = (extras & 12) >> 2;
net->arcs_sorted_out = (extras & 48) >> 4;
/* Sigma */
while (strcmp(buf, "##sigma##") != 0) { /* Loop until we encounter ##sigma## */
if (buf[0] == '\0') {
printf("File format error at sigma definition!\n");
fsm_destroy(net);
return NULL;
}
io_gets(iobh, buf);
}
for (;;) {
io_gets(iobh, buf);
if (buf[0] == '#') break;
if (buf[0] == '\0') continue;
new_symbol = strstr(buf, " ");
new_symbol[0] = '\0';
new_symbol++;
if (new_symbol[0] == '\0') {
sscanf(buf,"%i", &new_symbol_number);
sigma_add_number(net->sigma, "\n", new_symbol_number);
} else {
sscanf(buf,"%i", &new_symbol_number);
sigma_add_number(net->sigma, new_symbol, new_symbol_number);
}
}
/* States */
if (strcmp(buf, "##states##") != 0) {
printf("File format error!\n");
return NULL;
}
net->states = xxmalloc(net->linecount*sizeof(struct fsm_state));
fsm = net->states;
laststate = -1;
for (i=0; ;i++) {
io_gets(iobh, buf);
if (buf[0] == '#') break;
/* scanf is just too slow here */
//items = sscanf(buf, "%i %i %i %i %i",&lineint[0], &lineint[1], &lineint[2], &lineint[3], &lineint[4]);
items = explode_line(buf, &lineint[0]);
switch (items) {
case 2:
(fsm+i)->state_no = laststate;
(fsm+i)->in = lineint[0];
(fsm+i)->out = lineint[0];
(fsm+i)->target = lineint[1];
(fsm+i)->final_state = last_final;
break;
case 3:
(fsm+i)->state_no = laststate;
(fsm+i)->in = lineint[0];
(fsm+i)->out = lineint[1];
(fsm+i)->target = lineint[2];
(fsm+i)->final_state = last_final;
break;
case 4:
(fsm+i)->state_no = lineint[0];
(fsm+i)->in = lineint[1];
(fsm+i)->out = lineint[1];
(fsm+i)->target = lineint[2];
(fsm+i)->final_state = lineint[3];
laststate = lineint[0];
last_final = lineint[3];
break;
case 5:
(fsm+i)->state_no = lineint[0];
(fsm+i)->in = lineint[1];
(fsm+i)->out = lineint[2];
(fsm+i)->target = lineint[3];
(fsm+i)->final_state = lineint[4];
laststate = lineint[0];
last_final = lineint[4];
break;
default:
printf("File format error\n");
return NULL;
}
if (laststate > 0) {
(fsm+i)->start_state = 0;
} else if (laststate == -1) {
(fsm+i)->start_state = -1;
} else {
(fsm+i)->start_state = 1;
}
}
if (strcmp(buf, "##cmatrix##") == 0) {
cmatrix_init(net);
cm = net->medlookup->confusion_matrix;
for (;;) {
io_gets(iobh, buf);
if (buf[0] == '#') break;
sscanf(buf,"%i", &i);
*cm = i;
cm++;
}
}
if (strcmp(buf, "##end##") != 0) {
printf("File format error!\n");
return NULL;
}
return(net);
}
/*
 * Copy the next line from the handle's read position into target.
 *
 * Copies characters up to, but not including, the next '\n' or '\0',
 * terminates target with '\0', and advances the read position past the
 * newline. The read position is never moved beyond a terminating '\0'.
 *
 * Returns the number of characters copied (0 for an empty line or when the
 * read position already sits on '\0').
 *
 * NOTE(review): no destination size is passed in, so target must be large
 * enough for the longest line in the buffer -- verify against callers.
 */
static int io_gets(struct io_buf_handle *iobh, char *target) {
    int len = 0;

    while (iobh->io_buf_ptr[len] != '\n' && iobh->io_buf_ptr[len] != '\0') {
        target[len] = iobh->io_buf_ptr[len];
        len++;
    }
    target[len] = '\0';

    /* Skip the newline that ended the line; stay put on the final NUL. */
    if (iobh->io_buf_ptr[len] == '\0') {
        iobh->io_buf_ptr += len;
    } else {
        iobh->io_buf_ptr += len + 1;
    }
    return len;
}
/*
 * Load the contents of filename into a freshly allocated, NUL-terminated
 * buffer owned by iobh, and point the handle's read position at its start.
 *
 * The buffer is sized from io_get_file_size(filename) plus one byte for the
 * terminator and is filled through gzopen()/gzread().
 *
 * Returns the number of bytes stored in the buffer, or 0 when
 * io_get_file_size() reports 0, the file cannot be opened, or nothing could
 * be read. On a failure after allocation the handle is left holding an
 * empty, terminated buffer, which io_free() releases.
 */
size_t io_gz_file_to_mem(struct io_buf_handle *iobh, char *filename) {
    size_t size;
    int got;
    gzFile gzf;

    size = io_get_file_size(filename);
    if (size == 0) {
        return 0;
    }
    (iobh->io_buf) = xxmalloc((size+1)*sizeof(char));
    /* Start from an empty string so the handle is consistent on any failure. */
    *(iobh->io_buf) = '\0';
    iobh->io_buf_ptr = iobh->io_buf;

    gzf = gzopen(filename, "rb");
    if (gzf == NULL) {
        return 0;
    }
    got = gzread(gzf, iobh->io_buf, (unsigned) size);
    gzclose(gzf);
    if (got <= 0) {
        /* Read error or no data: do not expose uninitialised bytes. */
        *(iobh->io_buf) = '\0';
        return 0;
    }
    /* Terminate after the bytes actually read, not after the expected size,
       so a short read never leaves unread garbage in front of the NUL. */
    *((iobh->io_buf)+got) = '\0';
    return (size_t) got;
}
/*
 * Allocate a buffer handle with xxmalloc() and return it with both the
 * buffer pointer and the read position set to NULL.
 */
struct io_buf_handle *io_init() {
    struct io_buf_handle *handle = xxmalloc(sizeof *handle);

    handle->io_buf = NULL;
    handle->io_buf_ptr = NULL;
    return handle;
}
/*
 * Release a buffer handle: free its buffer if one is attached, then free the
 * handle itself. iobh must not be used afterwards.
 */
void io_free(struct io_buf_handle *iobh) {
    void *data = iobh->io_buf;

    if (data != NULL) {
        /* Detach before freeing so the handle never holds a stale pointer. */
        iobh->io_buf = NULL;
        xxfree(data);
    }
    xxfree(iobh);
}