[apparmor] [PATCH] Add compressed dfa matching routines to library, and a base test program

Mon Jan 11 11:14:22 UTC 2016

On 01/08/2016 08:22 PM, Seth Arnold wrote:
> On Fri, Jan 08, 2016 at 03:48:06PM -0800, John Johansen wrote:
>> Signed-off-by: John Johansen <john.johansen at canonical.com>
> 
> This is pretty cool! A few comments inline...
> 
>> +++ b/devtools/README
> 
>> +    ./test_re -t -c 10000000 -E expr.txt -M match.txt
>> +    elapsed time: 8.421s
>> +
>> +to generate an hfa
>> +    ./test_r -o hfa.file -E expr.txt
> 
> Should probably be ./test_re

yep

> 
>> +print_hfa: simple hfa text dump utility
>> +Eg.
>> +  print_hfa hfa.file
> 
> Should probably be ./print_hfa
> 
sure

> 
>> +++ b/devtools/print_hfa.c
> 
>> +char *load_file(const char *path, ssize_t *fsize)
>> +{
>> +	struct stat stat;
>> +	char *buffer;
>> +	ssize_t remaining;
>> +	int fd, res;
>> +
>> +	fd = open(path, O_RDONLY);
>> +	if (fd == -1) {
>> +		fprintf(stderr, " Error '%s', failed to open '%s'\n", strerror(errno), path);
>> +		goto out;
>> +	}
>> +
>> +	res = fstat(fd, &stat);
>> +	if (res != 0) {
>> +		fprintf(stderr, "Error '%s', could not get size of file '%s'\n", strerror(errno), path);
>> +		close(fd);
>> +		goto out_fd;
>> +	}
> 
> These error strings are funny, I'm more accustomed to seeing the error
> come after the filename; also, glibc has a %m that automatically means

meh, I'm some what in different but it can be changed

> "strerror(errno)". e.g:
> 
>   +		fprintf(stderr, "Failed to open '%s': %m\n", path);
>   +		fprintf(stderr, "Failed to get size of file '%s': %m\n", path);
> 

funny thing, I really don't like %m, I always forget it doesn't take an arg and
then it messes up my looking at the args ...

I can change it but ...

>> +
>> +	buffer = (char *) malloc(stat.st_size + 1);
> 
> No use is made of this terminating NUL in this program; much of this could
> be re-written to use:
> 
> buffer = mmap(NULL, stat.st_size, PROT_READ | MAP_PRIVATE, MAP_POPULATE, fd, 0);
> 
meh, no. at least not yet, I did want to support stdin and a few other things

>> +int main(int argc, char **argv)
>> +{
>> +	const char *err;
>> +	char *chfa;
>> +	ssize_t size;
>> +	int i;
>> +	
>> +	progname = argv[0];
>> +
>> +	if (argc < 2) {
>> +		usage();
>> +		return 1;
>> +	}
>> +
>> +	for (i = optind; i <= argc && argv[i]; i++) {
> 
> What does the i <= argc && argv[i] protect against that i < argc doesn't?
> 
good question, I'm not sure why its there anymore. You have to realize that some
of this code has been sitting around/has changed and evolved for years now. Its
always been in the some day, spare time slow hack away.

The why now is because type splitting needs some of it, and some of the other
coming bits will need testing.

>> +/* To write hfa to a memory buffer do
>> +void print_hfa_to_memory_stream(struct aa_hfa *hfa)
>> +{
>> +	char *bp;
>> +	size_t *size;
>> +
>> +	f = open_memstream(&bp, size);
> 
> There's a bug here, it should read:
> 
>   +	char *bp;
>   +	size_t size; /* note no * */
>   +
yep

>   +	f = open_memstream(&bp, &size);
> 
> This is so cool, I can't believe I haven't seen open_memstream() before.
> 
>> +	fprint_hfa(f, hfa);
>> +	fflush (f);				// update bp and size
>> +	printf ("buf = `%s', size = %d\n", bp, size);
>> +	fprintf (f, ", world");
>> +	fclose (f);				// update bp and size
>> +	printf ("buf = `%s', size = %d\n", bp, size);
>> +	free(bp);				// free stream buffer
>> +}
>> +*/
> 
> 
>> +++ b/devtools/test_re.cc
> 
>> +static int process_args(int argc, char *argv[])
>> +{
>> +	int c, o;
>> +
>> +	while ((c = getopt_long(argc, argv, short_options, long_options, &o)) != -1) {
>> +		long tmp;
>> +		switch (c) {
>> +		case 0:
>> +			fprintf(stderr, "Error in getopt_long handling\n");
>> +			exit(1);
>> +			break;
>> +		case 'h':
>> +			if (!optarg) {
>> +				usage();
>> +			} else if (strcmp(optarg, "Dump") == 0 ||
>> +				   strcmp(optarg, "dump") == 0 ||
>> +				   strcmp(optarg, "D") == 0) {
>> +				display_dump(progname);
>> +			} else if (strcmp(optarg, "Optimize") == 0 ||
>> +				   strcmp(optarg, "optimize") == 0 ||
>> +				   strcmp(optarg, "O") == 0) {
>> +				display_optimize(progname);
>> +			} else {
>> +				fprintf(stderr, "%s: Invalid --help option %s\n",
>> +				       progname, optarg);
>> +				exit(1);
>> +			}
>> +			exit(0);
>> +			break;
>> +		case 't':
>> +			g_print_time = 1;
>> +			break;
>> +		case 'c':	/* repeat count */
>> +			tmp = strtol(optarg, NULL, 0);
>> +			if (tmp < 1)
>> +				fprintf(stderr, "Error bad repeat count '%s'\n",
>> +					optarg);
>> +			else if (tmp == LONG_MAX && errno == ERANGE)
>> +				fprintf(stderr, "Error repeat count '%s' out of range\n", optarg);
>> +			else
>> +				g_repeat_count = tmp;
>> +			break;
>> +		case 's':
>> +			g_sep = optarg;
>> +			break;
>> +		case 'e':	/* expression */
>> +			if (!process_expr_line(optarg))
>> +				fprintf(stderr, "Error processing expr '%s'\n",
>> +					optarg);
>> +			break;
>> +		case 'E':
>> +			if (!process_expr_file(optarg)) {
>> +				fprintf(stderr, "Error processing expr file '%s'\n", optarg);
>> +				exit(1);
>> +			}
>> +			break;
>> +		case 'm':	/* text to match */
>> +			if (!process_match_line(optarg))
>> +				fprintf(stderr, "Error processing match '%s'\n",
>> +					optarg);
>> +			break;
>> +		case 'M':
>> +			if (!process_match_file(optarg)) {
>> +				fprintf(stderr, "Error processing match file '%s'\n", optarg);
>> +				exit(1);
>> +			}
>> +			break;
>> +		case 'o':
>> +			g_out_file = optarg;
>> +			break;
>> +		case 'D':
>> +			if (!optarg) {
>> +				g_dump_info = 1;
>> +				handle_flag_table(dumpflag_table, "stats",
>> +						  &hfaflags);
>> +			} else if (!handle_flag_table(dumpflag_table, optarg,
>> +						      &hfaflags)) {
>> +				fprintf(stderr, "%s: Invalid --Dump option %s\n",
>> +				       progname, optarg);
>> +				exit(1);
>> +			}
>> +			break;
> 
> I think there's a bug here, if handle_flag_table() returns 1, the
> g_dump_info variable won't be set. Is this intentional? It could be fixed
> by moving the g_dump_info up two lines, before the !optarg check.

yep

> 
>> +		case 'O':
>> +			if (!optarg) {
>> +				/* todo: default optimization */
>> +			} else if (!handle_flag_table(optflag_table, optarg,
>> +					       &hfaflags)) {
>> +				fprintf(stderr, "%s: Invalid --Optimize option %s\n",
>> +				       progname, optarg);
>> +				exit(1);
>> +			}
>> +			break;
>> +
>> +		}
>> +
>> +	}
>> +
>> +	return optind;
>> +}
>> +
> 
>> +int parse_entry(const char *line, entry *e)
>> +{
>> +	char *pos;
>> +	unsigned long tmp;
>> +
>> +	while(isspace(*line))
>> +		line++;
>> +	if (*line == 0)
>> +		return -1;
>> +
>> +	tmp = strtoul(line, &pos, 0);
>> +	if ((tmp == ULONG_MAX && errno == ERANGE) || !pos)
>> +	{
>> +		fprintf(stderr, "Error: parsing entry '%s'\n", line);
>> +		return 0;
>> +	}
>> +	e->perm = tmp;
>> +
>> +	while(isspace(*pos))
>> +		pos++;
>> +	e->str = pos;
> 
> I think this may give confusing results if the last line of the file looks
> like:
> 
> 1
> 
> Then e->str is left pointing at an ascii NUL.
> 
yep

>> +/* return the number of failures */
>> +int do_matches(aa_hfa *hfa, unsigned int count)
>> +{
>> +	unsigned int failures = 0;
>> +	clock_t t = clock();
>> +
>> +	for (unsigned int i = 0; i < count; i++) {
>> +		for (std::list<entry>::iterator j = g_matches.begin(); j != g_matches.end(); j++) {
>> +			struct aa_state state;
>> +			aa_hfa_match(hfa, &state, j->str);
>> +			unsigned int accept = aa_hfa_accept(hfa, &state);
>> +			if (accept != j->perm) {
>> +				failures++;
>> +			}
>> +		}
>> +	}
>> +	t = clock() - t;
>> +	unsigned int secs = t/CLOCKS_PER_SEC;
>> +	if (g_print_time)
>> +		printf("elapsed time: %u.%3.3lds\n", secs, (((t % CLOCKS_PER_SEC)*1000+500)/CLOCKS_PER_SEC));
>> +
> 
> I don't understand the +500 here..
> 
its away of forcing rounding

>> +int main(int argc, char **argv)
>> +{
>> +	struct direct_mapped_accept accept;
>> +	const char *err;
>> +
>> +	progname = argv[0];
>> +
>> +	if (argc < 3) {
>> +		usage();
>> +		return 1;
>> +	}
>> +
>> +	g_rules = new aare_rules(0);
>> +	if (!g_rules) {
>> +		fprintf(stderr, "Error failed to create ruleset\n");
>> +		exit(1);
>> +	}
>> +
>> +	optind = process_args(argc, argv);
>> +
>> +	for (int i = optind; i <= argc && argv[i]; i++) {
>> +		if (!process_match_line(argv[i]))
>> +			fprintf(stderr, "Error failed to process match arg '%s'\n", argv[i]);
>> +	}
>> +
>> +	if (!g_matches.size() && !g_out_file) {
>> +		fprintf(stderr, "Error at least one match string must be defined\n");
>> +		exit(1);
>> +	}
>> +
>> +	if (!g_rules->rule_count) {
>> +		fprintf(stderr, "Error at least one Expr pattern must be defined\n");
>> +		exit(1);
>> +	}
>> +	if (g_dump_info) {
>> +		fprintf(stderr, "Processed\n"
>> +			        "  %ld expr entries\n"
>> +			        "  %ld match entries\n",
>> +			(long) g_rules->rule_count, g_matches.size());
>> +		fprintf(stderr, "Creating HFA\n");
>> +	}
>> +	size_t size;
>> +	void *chfa = g_rules->create_dfa(&size, hfaflags);
>> +	if (!chfa) {
>> +		fprintf(stderr, "Error failed to compile hfa\n");
>> +		exit(1);
>> +	}
>> +	delete g_rules;
>> +
>> +	if (g_out_file) {
>> +		if (g_dump_info)
>> +			fprintf(stderr, "Writing HFA to '%s'\n", g_out_file);
>> +		FILE *f = fopen(g_out_file, "w");
>> +		if (!f) {
>> +			fprintf(stderr, "Error failed to dump hfa to '%s'\n", g_out_file);
>> +			exit(1);
> 
> It'd be nice to have the %m to describe why it failed:

does it have to be %m, how about strerror(errno) ;)

> 
>   +			fprintf(stderr, "Error failed to dump hfa to '%s': %m\n", g_out_file);
> 
>> +		}
>> +		fwrite(chfa, size, 1, f);
>> +		fclose(f);
>> +	}
>> +
>> +	if (g_dump_info)
>> +		fprintf(stderr, "Unpacking HFA\n");
>> +	struct aa_hfa *hfa = aa_hfa_unpack(chfa, size, &err, &accept);
>> +	if (!hfa) {
>> +		fprintf(stderr, "Error failed to unpack compressed hfa: %s\n", err);
>> +		exit(1);
>> +	}
>> +	free(chfa);
>> +
>> +	if (g_dump_info)
>> +		fprintf(stderr, "Starting matches\n");
>> +	if (g_matches.size()) {
>> +		int failures = do_matches(hfa, g_repeat_count);
>> +		if (failures) {
>> +			fprintf(stderr, "Error failed to match to expected perms %d times\n", failures);
>> +			exit(2);
>> +		}
>> +	}
>> +
>> +	aa_hfa_free(hfa);
>> +	return 0;
>> +}
> 
>> +++ b/libraries/libapparmor/src/hfa.c
> 
>> + * AppArmor's extended Hybrid Finite Atomata (eHFA) matching engine is
> 
> Automata
> 
yep

>> + * essentially an NFA with additional DFA constraints, with some limited
>> + * function callouts and scratch memory (it is not turing complete nor
> 
> Turing
sure

> 
>> + * even a full PFA).
>> + *
>> + * It is not a true NFA as it does not allow for lambda transitions, nor
>> + * does it allow for an arbitrary set of follow states for any given
>> + * character transition. Instead it should be thought of as a DFA with
>> + * limited NFA states that have constraints such that the number of
>> + * potential NFA states to track in parallel is known in advanced and
> 
> advance
> 
yep

>> + * of a fixed number.
>> + *
>> + * Atm this code only implements the DFA portion of the match and does
>> + * not allow for NFA states nor the extended work memory.
>> + */
> 
>> +/**
>> + * aa_hfa_init_state - initialize state information
>> + * @hfa: hfa to base initialization off of
>> + * @state: state tracking struct to initialize
>> + *
>> + * Returns: 0 on success else error
> 
> Many of these functions cannot return errors, and indeed can't error. It
> might make sense to change the return type to void:
> 
no

so one of the goals here was to make an api that could work with extended
functionality. As noted, this code only implements the DFA portion. However
once we add the eHFA bits (var matching, back references, fn triggers,
limited NFA states to control state explosion), it will be possible to
fail.

Ideally you would have a state buffer setup that would ensure that it
can't but that may not be practical or possible.

I actually like the api used in the old fns in the kernel better but it
is not extensible.

>> + */
>> +int aa_hfa_init_state(struct aa_hfa *hfa, struct aa_state *state)
>> +{
>> +	state->state = AA_HFA_START;
>> +
>> +	return 0;
>> +}
>> +
> 
>> +/**
>> + * aa_hfa_reset - reset a state tracking structure to start a new match
>> + * @hfa: dfa to reset for
>> + * @state: state to reset
>> + *
>> + * Returns: 0 on success else error
> 
> This function can't error:
> 
atm

>> + */
>> +int aa_hfa_reset(struct aa_hfa *hfa, struct aa_state *state)
>> +{
>> +	state->state = AA_HFA_START;
>> +
>> +	return 0;
>> +}
> 
> 
>> +/**
>> + * aa_hfa_matchn_cont - traverse @hfa from @state to find state @str stops at
>> + * @hfa: the hfa to match @str against  (NOT NULL)
>> + * @state: the state of the hfa to start matching in
>> + * @str: the string of bytes to match against the hfa  (NOT NULL)
>> + * @len: length of the string of bytes to match
>> + *
>> + * aa_hfa_matchn will match @str against the hfa and return the state it
>> + * finished matching in. The final state can be used to look up the accepting
>> + * label, or as the start state of a continuing match.
>> + *
>> + * This function will happily match again the 0 byte and only finishes
>> + * when @len input is consumed.
>> + *
>> + * Returns: 0 on success else error.
>> + *
>> + * Note: most matches are guarenteed to succeed, see aa_hfa_reset, and
>> + *       aa_hfa_can_fail.
>> + */
> 
> The old function name aa_hfa_matchn* survived in the comment block; and
yep

> the function can't return anything except 0, so this might be worth
> changing to void too:
> 
still no

>> +int aa_hfa_continuen(struct aa_hfa *hfa, struct aa_state *state,
>> +		     const char *str, size_t len)
>> +{
>> +	unsigned int s = state->state;
>> +
>> +	if (s == 0)
>> +		return 0;
>> +
>> +	/* current state is <state>, matching character *str */
>> +	if (hfa->ec) {
>> +		for (; len; len--)
>> +			match_EC_char(s, hfa, *str++);
>> +	} else {
>> +		for (; len; len--)
>> +			match_char(s, hfa, *str++);
>> +	}
>> +	state->state = s;
>> +
>> +	return 0;
>> +}
>> +
>> +/**
>> + * aa_hfa_continue - traverse @hfa from @state to find state @str stops at
>> + * @hfa: the hfa to match @str against  (NOT NULL)
>> + * @state: the state of the hfa to start matching in
>> + * @str: the null terminated string of bytes to match against the hfa (NOT NULL)
>> + *
>> + * aa_hfa_match will match @str against the hfa and return the state it
>> + * finished matching in. The final state can be used to look up the accepting
>> + * label, or as the start state of a continuing match.
>> + *
>> + * Returns: 0 on success else error.
>> + *
>> + * Note: most matches are guarenteed to succeed, see aa_hfa_start, and
>> + *       aa_hfa_can_fail.
>> + */
> 
> The old function name aa_hfa_match survived in the comment block, and this
yep

> function can't return anything except 0, so it might be worth changing to
> void:
> 
still no

>> +int aa_hfa_continue(struct aa_hfa *hfa, struct aa_state *state, const char *str)
>> +{
>> +	unsigned int s = state->state;
>> +
>> +	if (s == 0)
>> +		return 0;
>> +
>> +	if (hfa->ec) {
>> +		while (*str)
>> +			match_EC_char(s, hfa, *str++);
>> +	} else {
>> +		while (*str)
>> +			match_char(s, hfa, *str++);
>> +	}
>> +	state->state = s;
>> +
>> +	return 0;
>> +}
>> +
> 
>> +/**
>> + * aa_hfa_matchn - reset and traverse @hfa to find state @str stops at
>> + * @hfa: the hfa to match @str against  (NOT NULL)
>> + * @state: the state of the hfa to start matching in
>> + * @str: the string of bytes to match against the hfa  (NOT NULL)
>> + * @len: length of the string of bytes to match
>> + *
>> + * aa_hfa_matchn will match @str against the hfa and return the state it
>> + * finished matching in. The final state can be used to look up the accepting
>> + * label, or as the start state of a continuing match.
>> + *
>> + * This function will happily match again the 0 byte and only finishes
>> + * when @len input is consumed.
>> + *
>> + * Returns: 0 on success else error.
>> + *
>> + * Note: most matches are guarenteed to succeed, see aa_hfa_reset, and
>> + *       aa_hfa_can_fail.
>> + */
>> +int aa_hfa_matchn(struct aa_hfa *hfa, struct aa_state *state, const char *str,
>> +		  size_t len)
>> +{
>> +	int err = aa_hfa_reset(hfa, state);
>> +	if (err)
>> +		return err;
>> +	return aa_hfa_continuen(hfa, state, str, len);
>> +}
>> +
>> +/**
>> + * aa_hfa_match - reset and traverse @hfa to find state @str stops at
>> + * @hfa: the hfa to match @str against  (NOT NULL)
>> + * @state: the state of the hfa to start matching in
>> + * @str: the null terminated string of bytes to match against the hfa (NOT NULL)
>> + *
>> + * aa_hfa_match will match @str against the hfa and return the state it
>> + * finished matching in. The final state can be used to look up the accepting
>> + * label, or as the start state of a continuing match.
>> + *
>> + * Returns: 0 on success else error.
>> + *
>> + * Note: most matches are guarenteed to succeed, see aa_hfa_start, and
>> + *       aa_hfa_can_fail.
>> + */
>> +int aa_hfa_match(struct aa_hfa *hfa, struct aa_state *state, const char *str)
>> +{
>> +	int err = aa_hfa_reset(hfa, state);
>> +	if (err)
>> +		return err;
>> +	return aa_hfa_continue(hfa, state, str);
>> +}
>> +
>> +/**
>> + * aa_hfa_next - step one character to the next state in the hfa
>> + * @hfa: the hfa to tranverse (NOT NULL)
>> + * @state: the state to start in
>> + * @c: the input character to transition on
>> + *
>> + * aa_hfa_match will step through the hfa by one input character @c
>> + *
>> + * Returns: 0 on success else error.
>> + *
>> + * Note: most matches are guarenteed to succeed, see aa_hfa_start, and
>> + *       aa_hfa_can_fail.
>> + */
> 
> This function can't return anything except 0.
> 
you don't give up on asking for a void return easily do you? :P

>> +int aa_hfa_next(struct aa_hfa *hfa, struct aa_state *state, const char c)
>> +{
>> +	if (hfa->ec)
>> +		match_EC_char(state->state, hfa, c);
>> +	else
>> +		match_char(state->state, hfa, c);
>> +
>> +	return 0;
>> +}
> 
> Thanks
> 
> 
>