| 1 |
/*- |
| 2 |
* Copyright (c) 1989, 1993 |
| 3 |
* The Regents of the University of California. All rights reserved. |
| 4 |
* |
| 5 |
* This code is derived from software contributed to Berkeley by |
| 6 |
* Ken Arnold. |
| 7 |
* |
| 8 |
* Redistribution and use in source and binary forms, with or without |
| 9 |
* modification, are permitted provided that the following conditions |
| 10 |
* are met: |
| 11 |
* 1. Redistributions of source code must retain the above copyright |
| 12 |
* notice, this list of conditions and the following disclaimer. |
| 13 |
* 2. Redistributions in binary form must reproduce the above copyright |
| 14 |
* notice, this list of conditions and the following disclaimer in the |
| 15 |
* documentation and/or other materials provided with the distribution. |
| 16 |
* 3. Neither the name of the University nor the names of its contributors |
| 17 |
* may be used to endorse or promote products derived from this software |
| 18 |
* without specific prior written permission. |
| 19 |
* |
| 20 |
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
| 21 |
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 22 |
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 23 |
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
| 24 |
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 25 |
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 26 |
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 27 |
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 28 |
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 29 |
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 30 |
* SUCH DAMAGE. |
| 31 |
*/ |
| 32 |
|
| 33 |
#if 0 |
| 34 |
#ifndef lint |
| 35 |
static const char copyright[] = |
| 36 |
"@(#) Copyright (c) 1989, 1993\n\ |
| 37 |
The Regents of the University of California. All rights reserved.\n"; |
| 38 |
#endif /* not lint */ |
| 39 |
|
| 40 |
#ifndef lint |
| 41 |
static const char sccsid[] = "@(#)strfile.c 8.1 (Berkeley) 5/31/93"; |
| 42 |
#endif /* not lint */ |
| 43 |
#endif |
| 44 |
#include <sys/cdefs.h> |
| 45 |
__FBSDID("$FreeBSD$"); |
| 46 |
|
| 47 |
#include <sys/param.h> |
| 48 |
#include <sys/endian.h> |
| 49 |
#include <ctype.h> |
| 50 |
#include <locale.h> |
| 51 |
#include <stdbool.h> |
| 52 |
#include <stdio.h> |
| 53 |
#include <stdlib.h> |
| 54 |
#include <string.h> |
| 55 |
#include <time.h> |
| 56 |
#include <unistd.h> |
| 57 |
|
| 58 |
#include "strfile.h" |
| 59 |
|
| 60 |
/* |
| 61 |
* This program takes a file composed of strings separated by |
| 62 |
* lines consisting of a single delimiting character (default |
| 63 |
* character is '%') and creates another file which consists of a table |
| 64 |
* describing the file (structure from "strfile.h"), a table of seek |
| 65 |
* pointers to the start of the strings, and the strings, each terminated |
| 66 |
* by a null byte. Usage: |
| 67 |
* |
| 68 |
* % strfile [-iorsx] [ -cC ] sourcefile [ datafile ] |
| 69 |
* |
| 70 |
* C - Allow comments marked by a double delimiter at line's beginning |
| 71 |
* c - Change delimiting character from '%' to 'C' |
| 72 |
* s - Silent. Give no summary of data processed at the end of |
| 73 |
* the run. |
| 74 |
* o - order the strings in alphabetic order |
| 75 |
* i - if ordering, ignore case |
| 76 |
* r - randomize the order of the strings |
| 77 |
* x - set rotated bit |
| 78 |
* |
| 79 |
* Ken Arnold Sept. 7, 1978 -- |
| 80 |
* |
| 81 |
* Added ordering options. |
| 82 |
*/ |
| 83 |
|
| 84 |
/*
 * STORING_PTRS is true when either the ordering or the randomizing flag
 * is set; in that case seek pointers are collected in memory (see
 * add_offset) instead of being written out as they are found.
 */
#define	STORING_PTRS	(Oflag || Rflag)

/* Number of elements by which ALLOC grows a table. */
#define	CHUNKSIZE	512

/*
 * ALLOC(ptr, sz):
 *	Make sure the table "ptr" exists and has room for "sz" elements.
 *	A NULL table is created with CHUNKSIZE elements.  An existing
 *	table is grown to (sz + CHUNKSIZE) elements whenever (sz + 1) is
 *	a multiple of CHUNKSIZE, i.e. just before it would fill up.
 *	Running out of memory prints a message and exits with status 1.
 */
#define	ALLOC(ptr, sz)	do {						\
	if (ptr != NULL) {						\
		if (((sz) + 1) % CHUNKSIZE == 0)			\
			ptr = realloc(ptr,				\
			    ((sz) + CHUNKSIZE) * sizeof(*ptr));		\
	} else {							\
		ptr = malloc(CHUNKSIZE * sizeof(*ptr));			\
	}								\
	if (ptr == NULL) {						\
		fputs("out of space\n", stderr);			\
		exit(1);						\
	}								\
} while (0)
| 97 |
|
| 98 |
/*
 * One sort key per string: the character the ordering is keyed on first,
 * and the seek pointer of the string it belongs to.
 */
typedef struct {
	int	first;		/* first alphanumeric character of the string */
	off_t	pos;		/* offset of the string in the input file */
} STR;
| 102 |
|
| 103 |
static char	*Infile = NULL;			/* input file name */
static char	Outfile[MAXPATHLEN] = "";	/* output file name */
static char	Delimch = '%';			/* delimiting character */

/* Option flags; each is incremented by getargs() when its option is seen. */
static int	Cflag = false;		/* embedded comments */
static int	Sflag = false;		/* silent run flag */
static int	Oflag = false;		/* ordering flag */
static int	Iflag = false;		/* ignore case flag */
static int	Rflag = false;		/* randomize order flag */
static int	Xflag = false;		/* set rotated bit */

static uint32_t	Num_pts = 0;		/* number of pointers/strings */
| 114 |
|
| 115 |
static off_t *Seekpts; |
| 116 |
|
| 117 |
static FILE *Sort_1, *Sort_2; /* pointers for sorting */ |
| 118 |
|
| 119 |
static STRFILE Tbl; /* statistics table */ |
| 120 |
|
| 121 |
static STR *Firstch; /* first chars of each string */ |
| 122 |
|
| 123 |
/* Forward declarations of the file-local helpers. */
static void	add_offset(FILE *fp, off_t off);
static int	cmp_str(const void *s1, const void *s2);
static int	stable_collate_range_cmp(int c1, int c2);
static void	do_order(void);
static void	getargs(int argc, char **argv);
static void	randomize(void);
static void	usage(void);
| 130 |
|
| 131 |
/* |
| 132 |
* main: |
| 133 |
* Drive the sucker. There are two main modes -- either we store |
| 134 |
* the seek pointers, if the table is to be sorted or randomized, |
| 135 |
* or we write the pointer directly to the file, if we are to stay |
| 136 |
* in file order. If the former, we allocate and re-allocate in |
| 137 |
* CHUNKSIZE blocks; if the latter, we just write each pointer, |
| 138 |
* and then seek back to the beginning to write in the table. |
| 139 |
*/ |
| 140 |
int |
| 141 |
main(int ac, char *av[]) |
| 142 |
{ |
| 143 |
char *sp, *nsp, dc; |
| 144 |
FILE *inf, *outf; |
| 145 |
off_t last_off, pos, *p; |
| 146 |
size_t length; |
| 147 |
int first; |
| 148 |
uint32_t cnt; |
| 149 |
STR *fp; |
| 150 |
static char string[257]; |
| 151 |
|
| 152 |
setlocale(LC_ALL, ""); |
| 153 |
|
| 154 |
getargs(ac, av); /* evalute arguments */ |
| 155 |
dc = Delimch; |
| 156 |
if ((inf = fopen(Infile, "r")) == NULL) { |
| 157 |
perror(Infile); |
| 158 |
exit(1); |
| 159 |
} |
| 160 |
|
| 161 |
if ((outf = fopen(Outfile, "w")) == NULL) { |
| 162 |
perror(Outfile); |
| 163 |
exit(1); |
| 164 |
} |
| 165 |
if (!STORING_PTRS) |
| 166 |
fseek(outf, (long)sizeof(Tbl), SEEK_SET); |
| 167 |
|
| 168 |
/* |
| 169 |
* Write the strings onto the file |
| 170 |
*/ |
| 171 |
|
| 172 |
Tbl.str_longlen = 0; |
| 173 |
Tbl.str_shortlen = 0xffffffff; |
| 174 |
Tbl.str_delim = dc; |
| 175 |
Tbl.str_version = VERSION; |
| 176 |
first = Oflag; |
| 177 |
add_offset(outf, ftello(inf)); |
| 178 |
last_off = 0; |
| 179 |
do { |
| 180 |
sp = fgets(string, 256, inf); |
| 181 |
if (sp == NULL || (sp[0] == dc && sp[1] == '\n')) { |
| 182 |
pos = ftello(inf); |
| 183 |
length = (size_t)(pos - last_off) - |
| 184 |
(sp != NULL ? strlen(sp) : 0); |
| 185 |
last_off = pos; |
| 186 |
if (length == 0) |
| 187 |
continue; |
| 188 |
add_offset(outf, pos); |
| 189 |
if ((size_t)Tbl.str_longlen < length) |
| 190 |
Tbl.str_longlen = length; |
| 191 |
if ((size_t)Tbl.str_shortlen > length) |
| 192 |
Tbl.str_shortlen = length; |
| 193 |
first = Oflag; |
| 194 |
} |
| 195 |
else if (first) { |
| 196 |
for (nsp = sp; !isalnum((unsigned char)*nsp); nsp++) |
| 197 |
continue; |
| 198 |
ALLOC(Firstch, Num_pts); |
| 199 |
fp = &Firstch[Num_pts - 1]; |
| 200 |
if (Iflag && isupper((unsigned char)*nsp)) |
| 201 |
fp->first = tolower((unsigned char)*nsp); |
| 202 |
else |
| 203 |
fp->first = *nsp; |
| 204 |
fp->pos = Seekpts[Num_pts - 1]; |
| 205 |
first = false; |
| 206 |
} |
| 207 |
} while (sp != NULL); |
| 208 |
|
| 209 |
/* |
| 210 |
* write the tables in |
| 211 |
*/ |
| 212 |
|
| 213 |
fclose(inf); |
| 214 |
Tbl.str_numstr = Num_pts - 1; |
| 215 |
|
| 216 |
if (Cflag) |
| 217 |
Tbl.str_flags |= STR_COMMENTS; |
| 218 |
|
| 219 |
if (Oflag) |
| 220 |
do_order(); |
| 221 |
else if (Rflag) |
| 222 |
randomize(); |
| 223 |
|
| 224 |
if (Xflag) |
| 225 |
Tbl.str_flags |= STR_ROTATED; |
| 226 |
|
| 227 |
if (!Sflag) { |
| 228 |
printf("\"%s\" created\n", Outfile); |
| 229 |
if (Num_pts == 2) |
| 230 |
puts("There was 1 string"); |
| 231 |
else |
| 232 |
printf("There were %u strings\n", Num_pts - 1); |
| 233 |
printf("Longest string: %u byte%s\n", Tbl.str_longlen, |
| 234 |
Tbl.str_longlen == 1 ? "" : "s"); |
| 235 |
printf("Shortest string: %u byte%s\n", Tbl.str_shortlen, |
| 236 |
Tbl.str_shortlen == 1 ? "" : "s"); |
| 237 |
} |
| 238 |
|
| 239 |
rewind(outf); |
| 240 |
Tbl.str_version = htobe32(Tbl.str_version); |
| 241 |
Tbl.str_numstr = htobe32(Tbl.str_numstr); |
| 242 |
Tbl.str_longlen = htobe32(Tbl.str_longlen); |
| 243 |
Tbl.str_shortlen = htobe32(Tbl.str_shortlen); |
| 244 |
Tbl.str_flags = htobe32(Tbl.str_flags); |
| 245 |
fwrite((char *)&Tbl, sizeof(Tbl), 1, outf); |
| 246 |
if (STORING_PTRS) { |
| 247 |
for (p = Seekpts, cnt = Num_pts; cnt--; ++p) |
| 248 |
*p = htobe64(*p); |
| 249 |
fwrite(Seekpts, sizeof(*Seekpts), (size_t)Num_pts, outf); |
| 250 |
} |
| 251 |
fclose(outf); |
| 252 |
exit(0); |
| 253 |
} |
| 254 |
|
| 255 |
/* |
| 256 |
* This routine evaluates arguments from the command line |
| 257 |
*/ |
| 258 |
void |
| 259 |
getargs(int argc, char **argv) |
| 260 |
{ |
| 261 |
int ch; |
| 262 |
|
| 263 |
while ((ch = getopt(argc, argv, "Cc:iorsx")) != -1) |
| 264 |
switch(ch) { |
| 265 |
case 'C': /* embedded comments */ |
| 266 |
Cflag++; |
| 267 |
break; |
| 268 |
case 'c': /* new delimiting char */ |
| 269 |
Delimch = *optarg; |
| 270 |
if (!isascii(Delimch)) { |
| 271 |
printf("bad delimiting character: '\\%o\n'", |
| 272 |
(unsigned char)Delimch); |
| 273 |
} |
| 274 |
break; |
| 275 |
case 'i': /* ignore case in ordering */ |
| 276 |
Iflag++; |
| 277 |
break; |
| 278 |
case 'o': /* order strings */ |
| 279 |
Oflag++; |
| 280 |
break; |
| 281 |
case 'r': /* randomize pointers */ |
| 282 |
Rflag++; |
| 283 |
break; |
| 284 |
case 's': /* silent */ |
| 285 |
Sflag++; |
| 286 |
break; |
| 287 |
case 'x': /* set the rotated bit */ |
| 288 |
Xflag++; |
| 289 |
break; |
| 290 |
case '?': |
| 291 |
default: |
| 292 |
usage(); |
| 293 |
} |
| 294 |
argv += optind; |
| 295 |
|
| 296 |
if (*argv) { |
| 297 |
Infile = *argv; |
| 298 |
if (*++argv) |
| 299 |
strcpy(Outfile, *argv); |
| 300 |
} |
| 301 |
if (!Infile) { |
| 302 |
puts("No input file name"); |
| 303 |
usage(); |
| 304 |
} |
| 305 |
if (*Outfile == '\0') { |
| 306 |
strlcpy(Outfile, Infile, sizeof(Outfile)); |
| 307 |
strlcat(Outfile, ".dat", sizeof(Outfile)); |
| 308 |
} |
| 309 |
} |
| 310 |
|
| 311 |
/*
 * usage:
 *	Print the command synopsis on standard error and exit with
 *	status 1.
 */
void
usage(void)
{
	static const char synopsis[] =
	    "strfile [-Ciorsx] [-c char] source_file [output_file]\n";

	fputs(synopsis, stderr);
	exit(1);
}
| 318 |
|
| 319 |
/* |
| 320 |
* add_offset: |
| 321 |
* Add an offset to the list, or write it out, as appropriate. |
| 322 |
*/ |
| 323 |
void |
| 324 |
add_offset(FILE *fp, off_t off) |
| 325 |
{ |
| 326 |
off_t beoff; |
| 327 |
|
| 328 |
if (!STORING_PTRS) { |
| 329 |
beoff = htobe64(off); |
| 330 |
fwrite(&beoff, 1, sizeof(beoff), fp); |
| 331 |
} else { |
| 332 |
ALLOC(Seekpts, Num_pts + 1); |
| 333 |
Seekpts[Num_pts] = off; |
| 334 |
} |
| 335 |
Num_pts++; |
| 336 |
} |
| 337 |
|
| 338 |
/* |
| 339 |
* do_order: |
| 340 |
* Order the strings alphabetically (possibly ignoring case). |
| 341 |
*/ |
| 342 |
void |
| 343 |
do_order(void) |
| 344 |
{ |
| 345 |
uint32_t i; |
| 346 |
off_t *lp; |
| 347 |
STR *fp; |
| 348 |
|
| 349 |
Sort_1 = fopen(Infile, "r"); |
| 350 |
Sort_2 = fopen(Infile, "r"); |
| 351 |
qsort(Firstch, (size_t)Tbl.str_numstr, sizeof(*Firstch), cmp_str); |
| 352 |
i = Tbl.str_numstr; |
| 353 |
lp = Seekpts; |
| 354 |
fp = Firstch; |
| 355 |
while (i--) |
| 356 |
*lp++ = fp++->pos; |
| 357 |
fclose(Sort_1); |
| 358 |
fclose(Sort_2); |
| 359 |
Tbl.str_flags |= STR_ORDERED; |
| 360 |
} |
| 361 |
|
| 362 |
/*
 * stable_collate_range_cmp:
 *	Compare two characters using the collation order of the current
 *	locale.  When strcoll() considers them equal, the difference of
 *	the character values decides, so only identical characters
 *	compare as 0.
 */
static int
stable_collate_range_cmp(int c1, int c2)
{
	char lhs[2] = { (char)c1, '\0' };
	char rhs[2] = { (char)c2, '\0' };
	int order;

	order = strcoll(lhs, rhs);
	return (order != 0 ? order : c1 - c2);
}
| 374 |
|
| 375 |
/* |
| 376 |
* cmp_str: |
| 377 |
* Compare two strings in the file |
| 378 |
*/ |
| 379 |
int |
| 380 |
cmp_str(const void *s1, const void *s2) |
| 381 |
{ |
| 382 |
const STR *p1, *p2; |
| 383 |
int c1, c2, n1, n2, r; |
| 384 |
|
| 385 |
#define SET_N(nf,ch) (nf = (ch == '\n')) |
| 386 |
#define IS_END(ch,nf) (ch == EOF || (ch == (unsigned char)Delimch && nf)) |
| 387 |
|
| 388 |
p1 = (const STR *)s1; |
| 389 |
p2 = (const STR *)s2; |
| 390 |
|
| 391 |
c1 = (unsigned char)p1->first; |
| 392 |
c2 = (unsigned char)p2->first; |
| 393 |
if ((r = stable_collate_range_cmp(c1, c2)) != 0) |
| 394 |
return (r); |
| 395 |
|
| 396 |
fseeko(Sort_1, p1->pos, SEEK_SET); |
| 397 |
fseeko(Sort_2, p2->pos, SEEK_SET); |
| 398 |
|
| 399 |
n1 = false; |
| 400 |
n2 = false; |
| 401 |
while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0' && c1 != EOF) |
| 402 |
SET_N(n1, c1); |
| 403 |
while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0' && c2 != EOF) |
| 404 |
SET_N(n2, c2); |
| 405 |
|
| 406 |
while (!IS_END(c1, n1) && !IS_END(c2, n2)) { |
| 407 |
if (Iflag) { |
| 408 |
if (isupper(c1)) |
| 409 |
c1 = tolower(c1); |
| 410 |
if (isupper(c2)) |
| 411 |
c2 = tolower(c2); |
| 412 |
} |
| 413 |
if ((r = stable_collate_range_cmp(c1, c2)) != 0) |
| 414 |
return (r); |
| 415 |
SET_N(n1, c1); |
| 416 |
SET_N(n2, c2); |
| 417 |
c1 = getc(Sort_1); |
| 418 |
c2 = getc(Sort_2); |
| 419 |
} |
| 420 |
if (IS_END(c1, n1)) |
| 421 |
c1 = 0; |
| 422 |
if (IS_END(c2, n2)) |
| 423 |
c2 = 0; |
| 424 |
|
| 425 |
return (stable_collate_range_cmp(c1, c2)); |
| 426 |
} |
| 427 |
|
| 428 |
/* |
| 429 |
* randomize: |
| 430 |
* Randomize the order of the string table. We must be careful |
| 431 |
* not to randomize across delimiter boundaries. All |
| 432 |
* randomization is done within each block. |
| 433 |
*/ |
| 434 |
void |
| 435 |
randomize(void) |
| 436 |
{ |
| 437 |
uint32_t cnt, i; |
| 438 |
off_t tmp; |
| 439 |
off_t *sp; |
| 440 |
|
| 441 |
Tbl.str_flags |= STR_RANDOM; |
| 442 |
cnt = Tbl.str_numstr; |
| 443 |
|
| 444 |
/* |
| 445 |
* move things around randomly |
| 446 |
*/ |
| 447 |
|
| 448 |
for (sp = Seekpts; cnt > 0; cnt--, sp++) { |
| 449 |
i = arc4random_uniform(cnt); |
| 450 |
tmp = sp[0]; |
| 451 |
sp[0] = sp[i]; |
| 452 |
sp[i] = tmp; |
| 453 |
} |
| 454 |
} |