SigmaHQ/tools/sigma/parser/condition.py
Thomas Patzke 849a5a520d Conditional field mapping resolve_fieldname now functional
Before this method just had some placeholder function that wasn't really
implementing the intended functionality of the conditional field
mapping. Now aggregations get also conditional field mapping
functionality.
2019-10-09 23:57:41 +02:00

658 lines
26 KiB
Python

# Sigma parser
# Copyright 2016-2017 Thomas Patzke, Florian Roth
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import re
from .base import SimpleParser
from .exceptions import SigmaParseError
# Boolean operation identifiers, used as the ``op`` class attribute of the
# ConditionBase subclasses below to tag each parse tree node with its operation.
COND_NONE = 0
COND_AND = 1
COND_OR = 2
COND_NOT = 3
COND_NULL = 4
# Debugging code
def dumpNode(node, indent=''): # pragma: no cover
    """
    Debug helper: recursively print the parse tree rooted at *node*, one line
    per node, indented by depth. Returns *node* unchanged so the call can be
    spliced into expressions while debugging.
    """
    if not hasattr(node, 'items'):
        # Leaf: print type and repr of the value itself.
        print("%s%s=%s" % (indent, type(node).__name__, repr(node)))
    else:
        # Inner node: print its type and the container type of its children,
        # then descend. A non-list ``items`` holds a single child node.
        print("%s%s<%s>" % (indent, type(node).__name__,
                            type(node.items).__name__))
        children = node.items if type(node.items) == list else [node.items]
        for child in children:
            dumpNode(child, indent + ' ')
    return node
# Condition Tokenizer
class SigmaConditionToken:
    """Token of a Sigma condition expression"""
    # Token type constants; the values index into ``tokenstr`` below.
    TOKEN_AND = 1
    TOKEN_OR = 2
    TOKEN_NOT = 3
    TOKEN_ID = 4
    TOKEN_LPAR = 5
    TOKEN_RPAR = 6
    TOKEN_PIPE = 7
    TOKEN_ONE = 8
    TOKEN_ALL = 9
    TOKEN_AGG = 10
    TOKEN_EQ = 11
    TOKEN_LT = 12
    TOKEN_LTE = 13
    TOKEN_GT = 14
    TOKEN_GTE = 15
    TOKEN_BY = 16
    TOKEN_NEAR = 17

    # Human-readable token names for __str__; index 0 is a placeholder because
    # the token type constants start at 1.
    tokenstr = [
        "INVALID",
        "AND",
        "OR",
        "NOT",
        "ID",
        "LPAR",
        "RPAR",
        "PIPE",
        "ONE",
        "ALL",
        "AGG",
        "EQ",
        "LT",
        "LTE",
        "GT",
        "GTE",
        "BY",
        "NEAR",
    ]

    def __init__(self, tokendef, match, pos):
        """
        :param tokendef: (token type constant, compiled regex) pair from the tokenizer
        :param match: regex match object covering the recognized token text
        :param pos: position of the token in the condition string (for error messages)
        """
        self.type = tokendef[0]
        self.matched = match.group()
        self.pos = pos

    def __eq__(self, other):
        # Tokens compare equal to their type constant (int) or to their matched
        # text (str). Any other comparison raises, which the parser relies on
        # never happening for the token/int comparisons it performs.
        # NOTE(review): defining __eq__ without __hash__ makes tokens unhashable.
        if type(other) == int: # match against type
            return self.type == other
        if type(other) == str: # match against content
            return self.matched == other
        else:
            raise NotImplementedError("SigmaConditionToken can only be compared against token type constants")

    def __str__(self): # pragma: no cover
        return "[ Token: %s: '%s' ]" % (self.tokenstr[self.type], self.matched)
class SigmaConditionTokenizer:
    """Tokenize condition string into token sequence"""
    # Token definitions: (token type constant, regular expression), tried in
    # the given order at the current input position; the first match wins.
    # A token type of None means "recognize and discard" (whitespace).
    #
    # Bug fixes relative to the previous version:
    # * Keyword patterns now end with \b so identifiers that merely start with
    #   a keyword (e.g. "android", "summary", "nothing") are tokenized as IDs
    #   instead of being split into keyword + remainder.
    # * "<=" and ">=" are listed before their one-character prefixes "<" and
    #   ">"; with the old order the two-character operators could never match.
    tokendefs = [
        (SigmaConditionToken.TOKEN_ONE, re.compile("1 of\\b", re.IGNORECASE)),
        (SigmaConditionToken.TOKEN_ALL, re.compile("all of\\b", re.IGNORECASE)),
        (None, re.compile("[\\s\\r\\n]+")),
        (SigmaConditionToken.TOKEN_AGG, re.compile("(?:count|min|max|avg|sum)\\b", re.IGNORECASE)),
        (SigmaConditionToken.TOKEN_NEAR, re.compile("near\\b", re.IGNORECASE)),
        (SigmaConditionToken.TOKEN_BY, re.compile("by\\b", re.IGNORECASE)),
        (SigmaConditionToken.TOKEN_EQ, re.compile("==")),
        (SigmaConditionToken.TOKEN_LTE, re.compile("<=")),
        (SigmaConditionToken.TOKEN_LT, re.compile("<")),
        (SigmaConditionToken.TOKEN_GTE, re.compile(">=")),
        (SigmaConditionToken.TOKEN_GT, re.compile(">")),
        (SigmaConditionToken.TOKEN_PIPE, re.compile("\\|")),
        (SigmaConditionToken.TOKEN_AND, re.compile("and\\b", re.IGNORECASE)),
        (SigmaConditionToken.TOKEN_OR, re.compile("or\\b", re.IGNORECASE)),
        (SigmaConditionToken.TOKEN_NOT, re.compile("not\\b", re.IGNORECASE)),
        (SigmaConditionToken.TOKEN_ID, re.compile("[\\w*]+")),
        (SigmaConditionToken.TOKEN_LPAR, re.compile("\\(")),
        (SigmaConditionToken.TOKEN_RPAR, re.compile("\\)")),
    ]

    def __init__(self, condition):
        """
        Tokenize *condition* if it is a string, or wrap an already-built token
        list (used by __getitem__/__add__ to derive new tokenizer objects).

        :raises SigmaParseError: if the string contains unrecognizable input
        :raises TypeError: if *condition* is neither str nor list
        """
        if type(condition) == str:          # string that is parsed
            self.tokens = list()
            pos = 1                         # 1-based position for error reporting
            while len(condition) > 0:
                for tokendef in self.tokendefs:     # try each token definition at the current position
                    match = tokendef[1].match(condition)
                    if match:
                        if tokendef[0] != None:     # None = ignored token (whitespace)
                            self.tokens.append(SigmaConditionToken(tokendef, match, pos + match.start()))
                        pos += match.end()          # advance position and cut matched prefix from condition
                        condition = condition[match.end():]
                        break
                else:   # no valid token identified
                    # Bug fix: the old message announced a position but
                    # substituted the remaining condition text instead.
                    raise SigmaParseError("Unexpected token in condition at position %d: '%s'" % (pos, condition))
        elif type(condition) == list:       # list of tokens to be wrapped into a tokenizer object
            self.tokens = condition
        else:
            raise TypeError("SigmaConditionTokenizer constructor expects string or list, got %s" % (type(condition)))

    def __str__(self): # pragma: no cover
        return " ".join([str(token) for token in self.tokens])

    def __iter__(self):
        return iter(self.tokens)

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, i):
        # Integer indexing returns a single token; slicing returns a new
        # tokenizer so the parser can recurse on sub-sequences.
        if type(i) == int:
            return self.tokens[i]
        elif type(i) == slice:
            return SigmaConditionTokenizer(self.tokens[i])
        else:
            raise IndexError("Expected index or slice")

    def __add__(self, other):
        # Tokenizers concatenate with each other and with single tokens or
        # parse tree nodes (used when the parser replaces token spans by nodes).
        if isinstance(other, SigmaConditionTokenizer):
            return SigmaConditionTokenizer(self.tokens + other.tokens)
        elif isinstance(other, (SigmaConditionToken, ParseTreeNode)):
            return SigmaConditionTokenizer(self.tokens + [ other ])
        else:
            raise TypeError("+ operator expects SigmaConditionTokenizer or token type, got %s: %s" % (type(other), str(other)))

    def index(self, item):
        return self.tokens.index(item)
### Parse Tree Node Classes ###
class ParseTreeNode:
    """Parse Tree Node Base Class"""
    def __init__(self):
        # Abstract base: only concrete subclasses may be instantiated.
        # Bug fix: the message previously named ConditionBase (copy/paste error).
        raise NotImplementedError("ParseTreeNode is no usable class")
    def __str__(self): # pragma: no cover
        # Debug rendering; note this prints the subclass docstring at runtime,
        # so subclass docstrings are part of the (debug) output format.
        return "[ %s: %s ]" % (self.__doc__, str([str(item) for item in self.items]))
class ConditionBase(ParseTreeNode):
    """Base class for conditional operations"""
    op = COND_NONE      # boolean operation identifier; overridden by subclasses
    items = None        # child nodes; set by the subclass constructors
    def __init__(self):
        # Abstract: only the concrete AND/OR/NOT subclasses are instantiable.
        raise NotImplementedError("ConditionBase is no usable class")
    def add(self, item):
        """Append a child node."""
        self.items.append(item)
    def __iter__(self):
        return iter(self.items)
    def __len__(self):
        return len(self.items)
class ConditionAND(ConditionBase):
    """AND Condition"""
    op = COND_AND
    def __init__(self, sigma=None, op=None, val1=None, val2=None):
        """
        Two construction modes: with no arguments an empty node is created and
        filled via add(); when called by the parser with (sigma, op, val1, val2)
        the two operand values become the initial children.
        """
        # Idiom fix: compare against None with ``is``, not ``==``.
        if sigma is None and op is None and val1 is None and val2 is None:  # no parameters given - initialize empty
            self.items = list()
        else:   # called by parser, use given values
            self.items = [ val1, val2 ]
class ConditionOR(ConditionAND):
    """OR Condition"""
    # Same construction behavior as ConditionAND; only the operation id differs.
    op = COND_OR
class ConditionNOT(ConditionBase):
    """NOT Condition"""
    op = COND_NOT
    def __init__(self, sigma=None, op=None, val=None):
        """Empty node when called without arguments; parser mode wraps *val* as the single child."""
        if sigma is None and op is None and val is None:    # no parameters given - initialize empty
            self.items = list()
        else:   # called by parser, use given values
            self.items = [ val ]
    def add(self, item):
        """
        Set the negated subexpression; a NOT node holds exactly one child.

        :raises ValueError: if a child was already added
        """
        if len(self.items) == 0:
            # Bug fix: was ``super.add(item)`` which raises AttributeError on
            # the builtin ``super`` type instead of delegating to the parent.
            super().add(item)
        else:
            raise ValueError("Only one element allowed")
    @property
    def item(self):
        """The single negated child node, or None if nothing was added yet."""
        try:
            return self.items[0]
        except IndexError:
            return None
class ConditionNULLValue(ConditionNOT):
    """Condition: Field value is empty or doesn't exists"""
    # Marker subclass distinguished by type() checks in the optimizer; inherits
    # the single-child structure of ConditionNOT. (Docstring kept verbatim:
    # ParseTreeNode.__str__ prints __doc__ at runtime.)
    pass
class ConditionNotNULLValue(ConditionNULLValue):
    """Condition: Field value is not empty"""
    # Marker subclass; the optimizer converts NOT(NULL) <-> NOT-NULL using
    # these types (see SigmaConditionOptimizer._optimizeNode).
    pass
class NodeSubexpression(ParseTreeNode):
    """Subexpression"""
    def __init__(self, subexpr):
        # A bracketed subexpression wraps exactly one child node (never a
        # list); SigmaConditionOptimizer._stripSubexpressionNode asserts this.
        self.items = subexpr
# Parse tree generators: generate parse tree nodes from extended conditions
def generateXOf(sigma, val, condclass):
    """
    Generic implementation of (1|all) of x expressions.

    * condclass across all list items if x is name of definition
    * condclass across all definitions if x is keyword 'them'
    * condclass across all matching definitions if x is a wildcard expression, e.g. 'selection*'
    """
    if val.matched == "them":   # combine all definitions except the timeframe pseudo-definition
        cond = condclass()
        for name, definition in sigma.definitions.items():
            if name == "timeframe":
                continue
            cond.add(NodeSubexpression(sigma.parse_definition(definition)))
        return NodeSubexpression(cond)
    elif "*" in val.matched:    # combine all definitions whose name matches the wildcard pattern
        # Bug fix: previously ``val.matched.find("*") > 0``, which ignored a
        # wildcard in the first character (e.g. '*_selection') because find()
        # returns 0 in that case.
        cond = condclass()
        reDefPat = re.compile("^" + val.matched.replace("*", ".*") + "$")
        for name, definition in sigma.definitions.items():
            if name != "timeframe" and reDefPat.match(name):
                cond.add(NodeSubexpression(sigma.parse_definition(definition)))
        return NodeSubexpression(cond)
    else:   # combine all items of one named definition
        return NodeSubexpression(sigma.parse_definition_byname(val.matched, condclass))
def generateAllOf(sigma, op, val):
    """Convert 'all of x' expressions into ConditionAND"""
    # *op* is the ALL token itself and is unused; *val* is the operand token.
    return generateXOf(sigma, val, ConditionAND)
def generateOneOf(sigma, op, val):
    """Convert '1 of x' expressions into ConditionOR"""
    # *op* is the ONE token itself and is unused; *val* is the operand token.
    return generateXOf(sigma, val, ConditionOR)
def convertId(sigma, op):
    """Convert search identifiers (lists or maps) into condition nodes according to spec defaults"""
    # *op* is the ID token; its matched text is the definition name to resolve.
    return NodeSubexpression(sigma.parse_definition_byname(op.matched))
# Optimizer
class SigmaConditionOptimizer:
    """
    Optimizer for the parsed AST.
    """
    def _stripSubexpressionNode(self, node):
        """
        Recursively strips all subexpressions (i.e. brackets) from the AST.
        """
        if type(node) == NodeSubexpression:
            # A subexpression wraps exactly one node, never a list.
            assert(type(node.items) != list)
            return self._stripSubexpressionNode(node.items)
        if hasattr(node, 'items') and type(node) is not ConditionNOT:
            # NOTE(review): the exact-type check skips ConditionNOT (and only
            # exact ConditionNOT), so NOT subtrees keep their subexpression
            # nodes and see less optimization — confirm this is intentional.
            node.items = list(map(self._stripSubexpressionNode, node.items))
        return node

    def _unstripSubexpressionNode(self, node):
        """
        Recursively adds brackets around AND and OR operations in the AST.
        """
        if type(node) in (ConditionAND, ConditionOR):
            newnode = NodeSubexpression(node)
            node.items = list(map(self._unstripSubexpressionNode, node.items))
            return newnode
        return node

    def _ordered_uniq(self, l):
        """
        Remove duplicate entries in list *l* while preserving order.

        Used to be fast before it needed to work around list instead of
        tuple being used for lists within definitions in the AST.
        """
        seen = set()
        # Original fast one-liner, kept for reference:
        #return [x for x in l if x not in seen and not seen.add(x)]
        uniq = []
        for x in l:
            # Tuples whose second element is a list are unhashable; swap the
            # list for a tuple so set membership testing works ...
            if type(x) == tuple and type(x[1]) == list:
                x = (x[0], tuple(x[1]))
            # seen.add() returns None (falsy), so this adds as a side effect.
            if x not in seen and not seen.add(x):
                uniq.append(x)
        out = []
        for x in uniq:
            # ... and convert back to the original list form afterwards.
            if type(x) == tuple and type(x[1]) == tuple:
                out.append((x[0], list(x[1])))
            else:
                out.append(x)
        return out

    def _optimizeNode(self, node, changes=False):
        """
        Recursively optimize the AST rooted at *node* once. Returns the new
        root node and a boolean indicating if the tree was changed in this
        invocation or any of the recursive sub-invocations.

        You MUST remove all subexpression nodes from the AST before calling
        this function. Subexpressions are implicit around AND/OR nodes.
        """
        if type(node) in (ConditionOR, ConditionAND):
            # Remove empty OR(X), AND(X)
            if len(node.items) == 0:
                return None, True
            # Drop children that optimized away to None, then retry.
            if None in node.items:
                node.items = [item for item in node.items if item != None]
                return self._optimizeNode(node, changes=True)
            # OR(X), AND(X) => X
            if len(node.items) == 1:
                return self._optimizeNode(node.items[0], changes=True)
            # OR(X, X, ...), AND(X, X, ...) => OR(X, ...), AND(X, ...)
            uniq_items = self._ordered_uniq(node.items)
            if len(uniq_items) < len(node.items):
                node.items = uniq_items
                return self._optimizeNode(node, changes=True)
            # OR(X, OR(Y)) => OR(X, Y): flatten same-type nesting, but only if
            # every child is either a same-type node or a plain (leaf) tuple.
            if any(type(child) == type(node) for child in node.items) and \
               all(type(child) in (type(node), tuple) for child in node.items):
                newitems = []
                for child in node.items:
                    if hasattr(child, 'items'):
                        newitems.extend(child.items)
                    else:
                        newitems.append(child)
                node.items = newitems
                return self._optimizeNode(node, changes=True)
            # OR(AND(X, ...), AND(X, ...)) => AND(X, OR(AND(...), AND(...)))
            if type(node) == ConditionOR:
                othertype = ConditionAND
            else:
                othertype = ConditionOR
            if all(type(child) == othertype for child in node.items):
                # Operands common to ALL children can be factored out.
                promoted = []
                for cand in node.items[0]:
                    if all(cand in child for child in node.items[1:]):
                        promoted.append(cand)
                if len(promoted) > 0:
                    for child in node.items:
                        for cand in promoted:
                            child.items.remove(cand)
                    newnode = othertype()
                    newnode.items = promoted
                    newnode.add(node)
                    return self._optimizeNode(newnode, changes=True)
            # fallthrough: recurse into children below
        elif type(node) == ConditionNOT:
            assert(len(node.items) == 1)
            # NOT(NOT(X)) => X
            if type(node.items[0]) == ConditionNOT:
                assert(len(node.items[0].items) == 1)
                return self._optimizeNode(node.items[0].items[0], changes=True)
            # NOT(ConditionNULLValue) => ConditionNotNULLValue
            if type(node.items[0]) == ConditionNULLValue:
                newnode = ConditionNotNULLValue(val=node.items[0].items[0])
                return self._optimizeNode(newnode, changes=True)
            # NOT(ConditionNotNULLValue) => ConditionNULLValue
            if type(node.items[0]) == ConditionNotNULLValue:
                newnode = ConditionNULLValue(val=node.items[0].items[0])
                return self._optimizeNode(newnode, changes=True)
            # fallthrough: recurse into children below
        else:
            # Leaf or unknown node type: nothing to optimize here.
            return node, changes
        # Recurse into children and propagate their change flags upwards.
        itemresults = [self._optimizeNode(item, changes) for item in node.items]
        node.items = [res[0] for res in itemresults]
        if any(res[1] for res in itemresults):
            changes = True
        return node, changes

    def optimizeTree(self, tree):
        """
        Optimize the boolean expressions in the AST rooted at *tree*.

        The main idea behind optimizing the AST is that less repeated terms is
        generally better for backend performance. This is especially relevant
        to backends that do not perform any query language optimization down
        the road, such as those that generate code.

        A common example for when these suboptimal rules actually occur in
        practice is when a rule has multiple alternative detections that are
        OR'ed together in the condition, and all of the detections include a
        common element, such as the same EventID.

        The following optimizations are currently performed:
        - Removal of empty OR(), AND()
        - OR(X), AND(X) => X
        - OR(X, X, ...), AND(X, X, ...) => OR(X, ...), AND(X, ...)
        - OR(X, OR(Y)) => OR(X, Y)
        - OR(AND(X, ...), AND(X, ...)) => AND(X, OR(AND(...), AND(...)))
        - NOT(NOT(X)) => X
        - NOT(ConditionNULLValue) => ConditionNotNULLValue
        - NOT(ConditionNotNULLValue) => ConditionNULLValue

        Boolean logic simplification is NP-hard. To avoid backtracking,
        speculative transformations that may or may not lead to a more optimal
        expression were not implemented. These include for example factoring
        out common operands that are not in all, but only some AND()s within an
        OR(), or vice versa. Nevertheless, it is safe to assume that this
        implementation performs poorly on very large expressions.
        """
        # Brackets are removed first, the tree is optimized to a fixed point,
        # then implicit brackets are re-added around AND/OR for the backends.
        tree = self._stripSubexpressionNode(tree)
        changes = True
        while changes:
            tree, changes = self._optimizeNode(tree)
        tree = self._unstripSubexpressionNode(tree)
        return tree
# Condition parser
class SigmaConditionParser:
    """Parser for Sigma condition expression"""
    # Operator table: (token id, number of operands, parse tree node generator).
    # List order defines operator precedence (highest first).
    searchOperators = [
        (SigmaConditionToken.TOKEN_ALL, 1, generateAllOf),
        (SigmaConditionToken.TOKEN_ONE, 1, generateOneOf),
        (SigmaConditionToken.TOKEN_ID, 0, convertId),
        (SigmaConditionToken.TOKEN_NOT, 1, ConditionNOT),
        (SigmaConditionToken.TOKEN_AND, 2, ConditionAND),
        (SigmaConditionToken.TOKEN_OR, 2, ConditionOR),
    ]

    def __init__(self, sigmaParser, tokens):
        """
        Split *tokens* at the first pipe into a search part and an optional
        aggregation part, and parse both.
        """
        self.sigmaParser = sigmaParser
        self.config = sigmaParser.config
        self._optimizer = SigmaConditionOptimizer()
        if SigmaConditionToken.TOKEN_PIPE in tokens:    # condition contains at least one aggregation expression
            pipepos = tokens.index(SigmaConditionToken.TOKEN_PIPE)
            self.parsedSearch = self.parseSearch(tokens[:pipepos])
            self.parsedAgg = SigmaAggregationParser(tokens[pipepos + 1:], self.sigmaParser, self.config)
        else:
            self.parsedSearch = self.parseSearch(tokens)
            self.parsedAgg = None

    def parseSearch(self, tokens):
        """
        Iterative parsing of search expression.
        """
        # 1. Resolve parenthesized subexpressions innermost-first and replace
        #    each with a NodeSubexpression holding its parsed contents.
        #    Bug fix: the previous version paired the FIRST opening with the
        #    first closing parenthesis, so nested parentheses such as
        #    "(a or (b and c))" recursed on an unbalanced token slice and
        #    raised a spurious "Missing matching closing parentheses" error.
        while SigmaConditionToken.TOKEN_LPAR in tokens:
            try:
                rPos = tokens.index(SigmaConditionToken.TOKEN_RPAR)     # first closing parenthesis
                rTok = tokens[rPos]
            except ValueError as e:
                raise SigmaParseError("Missing matching closing parentheses") from e
            # The opening parenthesis that matches the first closing one is the
            # LAST opening parenthesis before it; the enclosed token span is
            # then guaranteed to contain no further parentheses.
            lPos = None
            for i in range(rPos):
                if tokens[i] == SigmaConditionToken.TOKEN_LPAR:
                    lPos = i
            if lPos is None:    # a ')' occurs before any '('
                lTok = tokens[tokens.index(SigmaConditionToken.TOKEN_LPAR)]
                raise SigmaParseError("Closing parentheses at position " + str(rTok.pos) + " precedes opening at position " + str(lTok.pos))
            lTok = tokens[lPos]
            if lPos + 1 == rPos:
                raise SigmaParseError("Empty subexpression at " + str(lTok.pos))
            subparsed = self.parseSearch(tokens[lPos + 1:rPos])
            tokens = tokens[:lPos] + NodeSubexpression(subparsed) + tokens[rPos + 1:]   # replace parentheses + expression with group node that contains parsed subexpression
        # 2. Iterate over all known operators in given precedence
        for operator in self.searchOperators:
            # 3. reduce all occurrences into corresponding parse tree nodes
            while operator[0] in tokens:
                pos_op = tokens.index(operator[0])
                tok_op = tokens[pos_op]
                if operator[1] == 0:    # nullary: the operator token itself (search identifier)
                    treenode = operator[2](self.sigmaParser, tok_op)
                    tokens = tokens[:pos_op] + treenode + tokens[pos_op + 1:]
                elif operator[1] == 1:  # unary prefix: operator value
                    pos_val = pos_op + 1
                    tok_val = tokens[pos_val]
                    treenode = operator[2](self.sigmaParser, tok_op, tok_val)
                    tokens = tokens[:pos_op] + treenode + tokens[pos_val + 1:]
                elif operator[1] == 2:  # binary infix: value1 operator value2
                    pos_val1 = pos_op - 1
                    pos_val2 = pos_op + 1
                    tok_val1 = tokens[pos_val1]
                    tok_val2 = tokens[pos_val2]
                    treenode = operator[2](self.sigmaParser, tok_op, tok_val1, tok_val2)
                    tokens = tokens[:pos_val1] + treenode + tokens[pos_val2 + 1:]
        if len(tokens) != 1:    # parse tree must begin with exactly one node
            raise ValueError("Parse tree must have exactly one start node!")
        query_cond = tokens[0]
        # 4. Integrate conditions from logsources in configurations
        ls_cond = self.sigmaParser.get_logsource_condition()
        if ls_cond is not None:
            cond = ConditionAND()
            cond.add(ls_cond)
            cond.add(query_cond)
            query_cond = cond
        return self._optimizer.optimizeTree(query_cond)

    def __str__(self): # pragma: no cover
        return str(self.parsedSearch)

    def __len__(self): # pragma: no cover
        return len(self.parsedSearch)
# Aggregation parser
class SigmaAggregationParser(SimpleParser):
"""Parse Sigma aggregation expression and provide parsed data"""
parsingrules = [
{ # State 0
SigmaConditionToken.TOKEN_AGG: ("aggfunc", "trans_aggfunc", 1),
SigmaConditionToken.TOKEN_NEAR: ("aggfunc", "init_near_parsing", 8),
},
{ # State 1
SigmaConditionToken.TOKEN_LPAR: (None, None, 2)
},
{ # State 2
SigmaConditionToken.TOKEN_RPAR: (None, None, 4),
SigmaConditionToken.TOKEN_ID: ("aggfield", "trans_fieldname", 3),
},
{ # State 3
SigmaConditionToken.TOKEN_RPAR: (None, None, 4)
},
{ # State 4
SigmaConditionToken.TOKEN_BY: ("cond_op", None, 5),
SigmaConditionToken.TOKEN_EQ: ("cond_op", None, 7),
SigmaConditionToken.TOKEN_LT: ("cond_op", None, 7),
SigmaConditionToken.TOKEN_LTE: ("cond_op", None, 7),
SigmaConditionToken.TOKEN_GT: ("cond_op", None, 7),
SigmaConditionToken.TOKEN_GTE: ("cond_op", None, 7),
},
{ # State 5
SigmaConditionToken.TOKEN_ID: ("groupfield", "trans_fieldname", 6)
},
{ # State 6
SigmaConditionToken.TOKEN_EQ: ("cond_op", None, 7),
SigmaConditionToken.TOKEN_LT: ("cond_op", None, 7),
SigmaConditionToken.TOKEN_LTE: ("cond_op", None, 7),
SigmaConditionToken.TOKEN_GT: ("cond_op", None, 7),
SigmaConditionToken.TOKEN_GTE: ("cond_op", None, 7),
},
{ # State 7
SigmaConditionToken.TOKEN_ID: ("condition", None, -1)
},
{ # State 8
SigmaConditionToken.TOKEN_ID: (None, "store_search_id", 9)
},
{ # State 9
SigmaConditionToken.TOKEN_AND: (None, "set_include", 10),
},
{ # State 10
SigmaConditionToken.TOKEN_NOT: (None, "set_exclude", 8),
SigmaConditionToken.TOKEN_ID: (None, "store_search_id", 9),
},
]
finalstates = { -1, 9 }
# Aggregation functions
AGGFUNC_COUNT = 1
AGGFUNC_MIN = 2
AGGFUNC_MAX = 3
AGGFUNC_AVG = 4
AGGFUNC_SUM = 5
AGGFUNC_NEAR = 6
aggfuncmap = {
"count": AGGFUNC_COUNT,
"min": AGGFUNC_MIN,
"max": AGGFUNC_MAX,
"avg": AGGFUNC_AVG,
"sum": AGGFUNC_SUM,
"near": AGGFUNC_NEAR,
}
    def __init__(self, tokens, parser, config):
        """
        :param tokens: the aggregation part of the condition (tokens after the pipe)
        :param parser: the Sigma parser instance, used for field name resolution
        :param config: configuration object providing get_fieldmapping()
        """
        self.parser = parser
        self.config = config
        self.aggfield = None    # set by the state machine when an aggregation field is parsed
        self.groupfield = None  # set by the state machine when a 'by' field is parsed
        super().__init__(tokens)
    def trans_aggfunc(self, name):
        """Translate aggregation function name into constant

        :raises SigmaParseError: if *name* is not a known aggregation function
        """
        try:
            return self.aggfuncmap[name]
        except KeyError:
            raise SigmaParseError("Unknown aggregation function '%s'" % (name))
def trans_fieldname(self, fieldname):
"""Translate field name into configured mapped name"""
mapped = self.config.get_fieldmapping(fieldname).resolve_fieldname(fieldname, self.parser)
if type(mapped) == str:
return mapped
else:
raise NotImplementedError("Field mappings in aggregations must be single valued")
def init_near_parsing(self, name):
"""Initialize data structures for 'near" aggregation operator parsing"""
self.include = list()
self.exclude = list()
self.current = self.include
return self.trans_aggfunc(name)
    def store_search_id(self, name):
        """Append a 'near' operand to the currently targeted include/exclude list."""
        self.current.append(name)
        return name
    def set_include(self, name):
        # State machine callback; *name* is the matched token text and unused.
        self.current = self.include
    def set_exclude(self, name):
        # State machine callback; *name* is the matched token text and unused.
        self.current = self.exclude