2017-02-13 22:29:56 +00:00
|
|
|
# Sigma parser
|
2017-12-07 20:55:43 +00:00
|
|
|
# Copyright 2016-2017 Thomas Patzke, Florian Roth
|
|
|
|
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU Lesser General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU Lesser General Public License for more details.
|
|
|
|
|
|
|
|
# You should have received a copy of the GNU Lesser General Public License
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
2017-02-13 22:29:56 +00:00
|
|
|
|
|
|
|
import re
|
2018-07-26 22:02:07 +00:00
|
|
|
from .base import SimpleParser
|
|
|
|
from .exceptions import SigmaParseError
|
2017-10-31 21:13:20 +00:00
|
|
|
|
2017-02-15 23:40:08 +00:00
|
|
|
# Operation identifiers for the `op` class attribute of ConditionBase
# subclasses below.
COND_NONE = 0   # no operation (abstract base default)
COND_AND = 1    # logical conjunction
COND_OR = 2     # logical disjunction
COND_NOT = 3    # logical negation
COND_NULL = 4   # field value is null/missing
|
2017-02-15 23:40:08 +00:00
|
|
|
|
2018-11-21 22:22:38 +00:00
|
|
|
# Debugging code
|
|
|
|
def dumpNode(node, indent=''):   # pragma: no cover
    """
    Recursively print the AST rooted at *node* for debugging and return
    *node* unchanged.

    Objects exposing an `items` attribute are treated as inner nodes and
    recursed into; anything else is printed as a leaf via repr().
    """
    label = type(node).__name__
    if not hasattr(node, 'items'):
        # Leaf: a plain value, token or similar.
        print("%s%s=%s" % (indent, label, repr(node)))
        return node
    print("%s%s<%s>" % (indent, label, type(node.items).__name__))
    # A non-list `items` (e.g. NodeSubexpression) holds a single child node.
    children = node.items if type(node.items) == list else [node.items]
    for child in children:
        dumpNode(child, indent + ' ')
    return node
|
|
|
|
|
2018-07-26 21:40:22 +00:00
|
|
|
# Condition Tokenizer
|
2017-02-16 22:58:44 +00:00
|
|
|
class SigmaConditionToken:
    """Token of a Sigma condition expression"""

    # Token type identifiers; they double as indices into `tokenstr`.
    TOKEN_AND  = 1
    TOKEN_OR   = 2
    TOKEN_NOT  = 3
    TOKEN_ID   = 4
    TOKEN_LPAR = 5
    TOKEN_RPAR = 6
    TOKEN_PIPE = 7
    TOKEN_ONE  = 8
    TOKEN_ALL  = 9
    TOKEN_AGG  = 10
    TOKEN_EQ   = 11
    TOKEN_LT   = 12
    TOKEN_LTE  = 13
    TOKEN_GT   = 14
    TOKEN_GTE  = 15
    TOKEN_BY   = 16
    TOKEN_NEAR = 17

    # Human-readable token names indexed by type; slot 0 is unused.
    tokenstr = [
        "INVALID",
        "AND", "OR", "NOT", "ID", "LPAR", "RPAR", "PIPE", "ONE", "ALL",
        "AGG", "EQ", "LT", "LTE", "GT", "GTE", "BY", "NEAR",
    ]

    def __init__(self, tokendef, match, pos):
        """Build a token from a (type, regex) definition, its regex match object and its 1-based source position."""
        self.type = tokendef[0]
        self.matched = match.group()
        self.pos = pos

    def __eq__(self, other):
        """Compare against a token type constant (int) or the matched text (str); anything else raises."""
        if type(other) == int:      # match against type
            return self.type == other
        if type(other) == str:      # match against content
            return self.matched == other
        else:
            raise NotImplementedError("SigmaConditionToken can only be compared against token type constants")

    def __str__(self):  # pragma: no cover
        return f"[ Token: {self.tokenstr[self.type]}: '{self.matched}' ]"
|
|
|
|
|
|
|
|
class SigmaConditionTokenizer:
    """Tokenize condition string into token sequence"""

    # List of tokens: (token identifier, matching regular expression).
    # Matching is attempted in this order and the first match wins, so longer
    # operators must precede their own prefixes: LTE before LT and GTE before
    # GT.  (The original order listed "<" before "<=" and ">" before ">=",
    # which made '<=' and '>=' untokenizable: '<' matched first and the
    # dangling '=' matched nothing.)  Entries with identifier None
    # (whitespace) are consumed but not emitted.
    # NOTE(review): keyword patterns like "and"/"or"/"near" carry no trailing
    # word-boundary, so an identifier such as "andx" tokenizes as AND+ID("x")
    # -- confirm the intended grammar before tightening these patterns.
    tokendefs = [
            (SigmaConditionToken.TOKEN_ONE,  re.compile("1 of", re.IGNORECASE)),
            (SigmaConditionToken.TOKEN_ALL,  re.compile("all of", re.IGNORECASE)),
            (None,                           re.compile("[\\s\\r\\n]+")),
            (SigmaConditionToken.TOKEN_AGG,  re.compile("count|min|max|avg|sum", re.IGNORECASE)),
            (SigmaConditionToken.TOKEN_NEAR, re.compile("near", re.IGNORECASE)),
            (SigmaConditionToken.TOKEN_BY,   re.compile("by", re.IGNORECASE)),
            (SigmaConditionToken.TOKEN_EQ,   re.compile("==")),
            (SigmaConditionToken.TOKEN_LTE,  re.compile("<=")),
            (SigmaConditionToken.TOKEN_LT,   re.compile("<")),
            (SigmaConditionToken.TOKEN_GTE,  re.compile(">=")),
            (SigmaConditionToken.TOKEN_GT,   re.compile(">")),
            (SigmaConditionToken.TOKEN_PIPE, re.compile("\\|")),
            (SigmaConditionToken.TOKEN_AND,  re.compile("and", re.IGNORECASE)),
            (SigmaConditionToken.TOKEN_OR,   re.compile("or", re.IGNORECASE)),
            (SigmaConditionToken.TOKEN_NOT,  re.compile("not", re.IGNORECASE)),
            (SigmaConditionToken.TOKEN_ID,   re.compile("[\\w*]+")),
            (SigmaConditionToken.TOKEN_LPAR, re.compile("\\(")),
            (SigmaConditionToken.TOKEN_RPAR, re.compile("\\)")),
            ]

    def __init__(self, condition):
        """
        Build a token sequence from *condition*: either a raw condition
        string, or an already-built list of tokens/nodes (used internally by
        slicing and concatenation).

        :raises SigmaParseError: if part of the string matches no token
        :raises TypeError: if *condition* is neither str nor list
        """
        if type(condition) == str:          # String that is parsed
            self.tokens = list()
            pos = 1                         # 1-based position for error reporting
            while len(condition) > 0:
                for tokendef in self.tokendefs:     # iterate over defined tokens and try to recognize the next one
                    match = tokendef[1].match(condition)
                    if match:
                        if tokendef[0] is not None:
                            self.tokens.append(SigmaConditionToken(tokendef, match, pos + match.start()))
                        pos += match.end()  # increase position and cut matched prefix from condition
                        condition = condition[match.end():]
                        break
                else:   # no valid token identified
                    # Fix: the original formatted the remaining text into a
                    # "position %s" slot; report the numeric position too.
                    raise SigmaParseError("Unexpected token in condition at position %d: %s" % (pos, condition))
        elif type(condition) == list:       # List of tokens to be converted into SigmaConditionTokenizer class
            self.tokens = condition
        else:
            raise TypeError("SigmaConditionTokenizer constructor expects string or list, got %s" % (type(condition)))

    def __str__(self):  # pragma: no cover
        return " ".join([str(token) for token in self.tokens])

    def __iter__(self):
        return iter(self.tokens)

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, i):
        """Integer index returns the token; a slice returns a new tokenizer."""
        if type(i) == int:
            return self.tokens[i]
        elif type(i) == slice:
            return SigmaConditionTokenizer(self.tokens[i])
        else:
            raise IndexError("Expected index or slice")

    def __add__(self, other):
        """Concatenate with another tokenizer, or append a single token/parse-tree node."""
        if isinstance(other, SigmaConditionTokenizer):
            return SigmaConditionTokenizer(self.tokens + other.tokens)
        elif isinstance(other, (SigmaConditionToken, ParseTreeNode)):
            return SigmaConditionTokenizer(self.tokens + [ other ])
        else:
            raise TypeError("+ operator expects SigmaConditionTokenizer or token type, got %s: %s" % (type(other), str(other)))

    def index(self, item):
        """Position of the first token equal to *item* (type constant or matched text)."""
        return self.tokens.index(item)
|
2017-03-04 23:37:28 +00:00
|
|
|
|
2017-02-15 23:40:08 +00:00
|
|
|
### Parse Tree Node Classes ###
|
2017-02-22 21:47:12 +00:00
|
|
|
class ParseTreeNode:
    """Parse Tree Node Base Class"""

    def __init__(self):
        # Abstract base: only subclasses are instantiated.
        # Fix: the original message said "ConditionBase is no usable class",
        # a copy/paste slip from the sibling class -- this is ParseTreeNode.
        raise NotImplementedError("ParseTreeNode is no usable class")

    def __str__(self):  # pragma: no cover
        # Subclasses provide `items`; the class docstring serves as the label.
        return "[ %s: %s ]" % (self.__doc__, str([str(item) for item in self.items]))
|
|
|
|
|
|
|
|
class ConditionBase(ParseTreeNode):
    """Base class for conditional operations"""

    op = COND_NONE   # operation identifier (COND_* constant), overridden by subclasses
    items = None     # child nodes; concrete subclass __init__ creates the list

    def __init__(self):
        # Abstract: only subclasses may be instantiated.
        raise NotImplementedError("ConditionBase is no usable class")

    def add(self, item):
        """Append a child node to this condition."""
        self.items.append(item)

    def __len__(self):
        """Number of direct child nodes."""
        return len(self.items)

    def __iter__(self):
        """Iterate over the direct child nodes."""
        return iter(self.items)
|
|
|
|
|
2017-02-15 23:40:08 +00:00
|
|
|
class ConditionAND(ConditionBase):
    """AND Condition"""

    op = COND_AND

    def __init__(self, sigma=None, op=None, val1=None, val2=None):
        """
        Create an AND node.

        Without arguments the node starts empty and children are attached via
        add().  The condition parser calls it as (sigma, op-token, val1, val2)
        and the two operands become the initial children; *sigma* and *op*
        exist only for the uniform operator-constructor interface and are
        unused here.
        """
        # Use identity checks instead of `== None`: operand tokens define a
        # custom __eq__ and PEP 8 mandates `is None` for None tests anyway.
        if sigma is None and op is None and val1 is None and val2 is None:  # no parameters given - initialize empty
            self.items = list()
        else:                                                               # called by parser, use given values
            self.items = [ val1, val2 ]
|
2017-02-15 23:40:08 +00:00
|
|
|
|
|
|
|
class ConditionOR(ConditionAND):
    """OR Condition"""
    # Same constructor and child handling as ConditionAND; only the
    # operation identifier differs.
    op = COND_OR
|
|
|
|
|
|
|
|
class ConditionNOT(ConditionBase):
    """NOT Condition"""

    op = COND_NOT

    def __init__(self, sigma=None, op=None, val=None):
        """
        Create a NOT node: empty when called without arguments, otherwise
        wrapping the single operand *val* handed over by the parser.  *sigma*
        and *op* exist only for the uniform operator-constructor interface
        and are unused here.
        """
        if sigma is None and op is None and val is None:    # no parameters given - initialize empty
            self.items = list()
        else:                                               # called by parser, use given values
            self.items = [ val ]

    def add(self, item):
        """Attach the negated subexpression; NOT accepts exactly one child."""
        if len(self.items) == 0:
            # Fix: the original read `super.add(item)`, an attribute access on
            # the builtin `super` type itself, which raises AttributeError at
            # runtime instead of delegating to ConditionBase.add.
            super().add(item)
        else:
            raise ValueError("Only one element allowed")

    @property
    def item(self):
        """The single negated child node, or None if none was set yet."""
        try:
            return self.items[0]
        except IndexError:
            return None
|
|
|
|
|
2017-10-29 22:57:39 +00:00
|
|
|
class ConditionNULLValue(ConditionNOT):
    """Condition: Field value is empty or doesn't exists"""
    # Structurally identical to ConditionNOT (single child = the field name);
    # backends recognize the type and emit an "is null / missing" query.
    pass
|
|
|
|
|
|
|
|
class ConditionNotNULLValue(ConditionNULLValue):
    """Condition: Field value is not empty"""
    # Inverse of ConditionNULLValue; produced by the optimizer when a NOT
    # wraps a ConditionNULLValue (see SigmaConditionOptimizer._optimizeNode).
    pass
|
|
|
|
|
2017-02-22 21:47:12 +00:00
|
|
|
class NodeSubexpression(ParseTreeNode):
    """Subexpression"""
    # Wraps a parsed subtree that originated from a parenthesized group or a
    # parsed definition.  Unlike the condition classes, `items` holds a single
    # node rather than a list.
    def __init__(self, subexpr):
        self.items = subexpr
|
2017-02-22 21:43:35 +00:00
|
|
|
|
2018-07-26 22:02:07 +00:00
|
|
|
# Parse tree generators: generate parse tree nodes from extended conditions
|
|
|
|
def generateXOf(sigma, val, condclass):
    """
    Generic implementation of (1|all) of x expressions.

    * condclass across all list items if x is name of definition
    * condclass across all definitions if x is keyword 'them'
    * condclass across all matching definitions if x is a wildcard
      expression, e.g. 'selection*'
    """
    if val.matched == "them":           # condclass across all definitions
        cond = condclass()
        for name, definition in sigma.definitions.items():
            if name == "timeframe":     # timeframe is rule metadata, not a searchable definition
                continue
            cond.add(NodeSubexpression(sigma.parse_definition(definition)))
        return NodeSubexpression(cond)
    elif "*" in val.matched:            # condclass across all matching definitions
        # Fix: the original tested `val.matched.find("*") > 0`, which missed
        # wildcard patterns starting with '*' (e.g. "*_selection").
        cond = condclass()
        reDefPat = re.compile("^" + val.matched.replace("*", ".*") + "$")
        for name, definition in sigma.definitions.items():
            if name != "timeframe" and reDefPat.match(name):
                cond.add(NodeSubexpression(sigma.parse_definition(definition)))
        return NodeSubexpression(cond)
    else:                               # condclass across all items of definition
        return NodeSubexpression(sigma.parse_definition_byname(val.matched, condclass))
|
|
|
|
|
2018-07-26 22:02:07 +00:00
|
|
|
def generateAllOf(sigma, op, val):
    """
    Translate an 'all of x' expression into a ConditionAND subtree.

    *op* is the ALL token and is unused; *val* is the token naming the
    target definition(s).
    """
    return generateXOf(sigma, val, ConditionAND)
|
2017-02-22 21:43:35 +00:00
|
|
|
|
2018-07-26 22:02:07 +00:00
|
|
|
def generateOneOf(sigma, op, val):
    """
    Translate a '1 of x' expression into a ConditionOR subtree.

    *op* is the ONE token and is unused; *val* is the token naming the
    target definition(s).
    """
    return generateXOf(sigma, val, ConditionOR)
|
2017-02-22 21:43:35 +00:00
|
|
|
|
|
|
|
def convertId(sigma, op):
    """
    Resolve an ID token into a parse tree node.

    Looks up the definition named by *op* (a list or map search identifier)
    and wraps the result, parsed per spec defaults, in a NodeSubexpression.
    """
    parsed = sigma.parse_definition_byname(op.matched)
    return NodeSubexpression(parsed)
|
2017-02-22 21:43:35 +00:00
|
|
|
|
2018-10-02 22:24:31 +00:00
|
|
|
# Optimizer
|
|
|
|
class SigmaConditionOptimizer:
    """
    Optimizer for the parsed AST.
    """
    def _stripSubexpressionNode(self, node):
        """
        Recursively strips all subexpressions (i.e. brackets) from the AST.
        """
        if type(node) == NodeSubexpression:
            # NodeSubexpression wraps exactly one child, never a list.
            assert(type(node.items) != list)
            return self._stripSubexpressionNode(node.items)
        # NOT nodes are deliberately left untouched so their bracket
        # structure is preserved.
        if hasattr(node, 'items') and type(node) is not ConditionNOT:
            node.items = list(map(self._stripSubexpressionNode, node.items))
        return node

    def _unstripSubexpressionNode(self, node):
        """
        Recursively adds brackets around AND and OR operations in the AST.
        """
        if type(node) in (ConditionAND, ConditionOR):
            newnode = NodeSubexpression(node)
            node.items = list(map(self._unstripSubexpressionNode, node.items))
            return newnode
        return node

    def _ordered_uniq(self, l):
        """
        Remove duplicate entries in list *l* while preserving order.

        Used to be fast before it needed to work around list instead of
        tuple being used for lists within definitions in the AST.
        """
        seen = set()
        #return [x for x in l if x not in seen and not seen.add(x)]
        uniq = []
        for x in l:
            # (field, [values]) entries are unhashable; temporarily convert
            # the value list to a tuple so it can live in the `seen` set.
            if type(x) == tuple and type(x[1]) == list:
                x = (x[0], tuple(x[1]))
            # `seen.add(x)` returns None, so `not seen.add(x)` is always True
            # and merely records x as seen as a side effect.
            if x not in seen and not seen.add(x):
                uniq.append(x)
        out = []
        # Undo the list->tuple conversion performed above.
        for x in uniq:
            if type(x) == tuple and type(x[1]) == tuple:
                out.append((x[0], list(x[1])))
            else:
                out.append(x)
        return out

    def _optimizeNode(self, node, changes=False):
        """
        Recursively optimize the AST rooted at *node* once. Returns the new
        root node and a boolean indicating if the tree was changed in this
        invocation or any of the recursive sub-invocations.

        You MUST remove all subexpression nodes from the AST before calling
        this function. Subexpressions are implicit around AND/OR nodes.
        """
        if type(node) in (ConditionOR, ConditionAND):
            # Remove empty OR(X), AND(X)
            if len(node.items) == 0:
                return None, True
            if None in node.items:
                node.items = [item for item in node.items if item != None]
                return self._optimizeNode(node, changes=True)

            # OR(X), AND(X) => X
            if len(node.items) == 1:
                return self._optimizeNode(node.items[0], changes=True)

            # OR(X, X, ...), AND(X, X, ...) => OR(X, ...), AND(X, ...)
            uniq_items = self._ordered_uniq(node.items)
            if len(uniq_items) < len(node.items):
                node.items = uniq_items
                return self._optimizeNode(node, changes=True)

            # OR(X, OR(Y)) => OR(X, Y)
            # Only flatten when every child is either the same operation or a
            # tuple (presumably a leaf field/value condition -- produced by
            # definition parsing outside this class).
            if any(type(child) == type(node) for child in node.items) and \
                    all(type(child) in (type(node), tuple) for child in node.items):
                newitems = []
                for child in node.items:
                    if hasattr(child, 'items'):
                        newitems.extend(child.items)
                    else:
                        newitems.append(child)
                node.items = newitems
                return self._optimizeNode(node, changes=True)

            # OR(AND(X, ...), AND(X, ...)) => AND(X, OR(AND(...), AND(...)))
            if type(node) == ConditionOR:
                othertype = ConditionAND
            else:
                othertype = ConditionOR
            if all(type(child) == othertype for child in node.items):
                # Collect operands present in every child (factoring
                # candidates), preserving the order of the first child.
                promoted = []
                for cand in node.items[0]:
                    if all(cand in child for child in node.items[1:]):
                        promoted.append(cand)
                if len(promoted) > 0:
                    # Pull the common operands out and nest the remainder.
                    for child in node.items:
                        for cand in promoted:
                            child.items.remove(cand)
                    newnode = othertype()
                    newnode.items = promoted
                    newnode.add(node)
                    return self._optimizeNode(newnode, changes=True)

            # fallthrough

        elif type(node) == ConditionNOT:
            assert(len(node.items) == 1)
            # NOT(NOT(X)) => X
            if type(node.items[0]) == ConditionNOT:
                assert(len(node.items[0].items) == 1)
                return self._optimizeNode(node.items[0].items[0], changes=True)

            # NOT(ConditionNULLValue) => ConditionNotNULLValue
            if type(node.items[0]) == ConditionNULLValue:
                newnode = ConditionNotNULLValue(val=node.items[0].items[0])
                return self._optimizeNode(newnode, changes=True)

            # NOT(ConditionNotNULLValue) => ConditionNULLValue
            if type(node.items[0]) == ConditionNotNULLValue:
                newnode = ConditionNULLValue(val=node.items[0].items[0])
                return self._optimizeNode(newnode, changes=True)

            # fallthrough

        else:
            # Leaf node (tuple, string, ...) -- nothing to optimize.
            return node, changes

        # No rule fired at this level: recurse into the children and
        # propagate whether any of them changed.
        itemresults = [self._optimizeNode(item, changes) for item in node.items]
        node.items = [res[0] for res in itemresults]
        if any(res[1] for res in itemresults):
            changes = True
        return node, changes

    def optimizeTree(self, tree):
        """
        Optimize the boolean expressions in the AST rooted at *tree*.

        The main idea behind optimizing the AST is that less repeated terms is
        generally better for backend performance. This is especially relevant
        to backends that do not perform any query language optimization down
        the road, such as those that generate code.

        A common example for when these suboptimal rules actually occur in
        practice is when a rule has multiple alternative detections that are
        OR'ed together in the condition, and all of the detections include a
        common element, such as the same EventID.

        The following optimizations are currently performed:
        - Removal of empty OR(), AND()
        - OR(X), AND(X) => X
        - OR(X, X, ...), AND(X, X, ...) => OR(X, ...), AND(X, ...)
        - OR(X, OR(Y)) => OR(X, Y)
        - OR(AND(X, ...), AND(X, ...)) => AND(X, OR(AND(...), AND(...)))
        - NOT(NOT(X)) => X
        - NOT(ConditionNULLValue) => ConditionNotNULLValue
        - NOT(ConditionNotNULLValue) => ConditionNULLValue

        Boolean logic simplification is NP-hard. To avoid backtracking,
        speculative transformations that may or may not lead to a more optimal
        expression were not implemented. These include for example factoring
        out common operands that are not in all, but only some AND()s within an
        OR(), or vice versa. Nevertheless, it is safe to assume that this
        implementation performs poorly on very large expressions.
        """
        # Strip brackets, iterate _optimizeNode to a fixed point, then
        # re-insert the implicit brackets around AND/OR nodes.
        tree = self._stripSubexpressionNode(tree)
        changes = True
        while changes:
            tree, changes = self._optimizeNode(tree)
        tree = self._unstripSubexpressionNode(tree)
        return tree
|
|
|
|
|
2018-07-26 21:40:22 +00:00
|
|
|
# Condition parser
|
2017-02-22 21:43:35 +00:00
|
|
|
class SigmaConditionParser:
    """Parser for Sigma condition expression"""
    searchOperators = [     # description of operators: (token id, number of operands, parse tree node class) - order == precedence
            (SigmaConditionToken.TOKEN_ALL, 1, generateAllOf),
            (SigmaConditionToken.TOKEN_ONE, 1, generateOneOf),
            (SigmaConditionToken.TOKEN_ID,  0, convertId),
            (SigmaConditionToken.TOKEN_NOT, 1, ConditionNOT),
            (SigmaConditionToken.TOKEN_AND, 2, ConditionAND),
            (SigmaConditionToken.TOKEN_OR,  2, ConditionOR),
            ]

    def __init__(self, sigmaParser, tokens):
        # Parse the search part of the condition; anything after a '|' is an
        # aggregation expression handled by SigmaAggregationParser.
        self.sigmaParser = sigmaParser
        self.config = sigmaParser.config
        self._optimizer = SigmaConditionOptimizer()

        if SigmaConditionToken.TOKEN_PIPE in tokens:    # Condition contains at least one aggregation expression
            pipepos = tokens.index(SigmaConditionToken.TOKEN_PIPE)
            self.parsedSearch = self.parseSearch(tokens[:pipepos])
            self.parsedAgg = SigmaAggregationParser(tokens[pipepos + 1:], self.sigmaParser, self.config)
        else:
            self.parsedSearch = self.parseSearch(tokens)
            self.parsedAgg = None

    def parseSearch(self, tokens):
        """
        Iterative parsing of search expression.

        Repeatedly reduces the token sequence: first parenthesized groups are
        replaced by NodeSubexpression nodes, then each operator (in precedence
        order) consumes its operand token(s) and is replaced by the
        corresponding parse tree node, until a single root node remains.
        """
        # 1. Identify subexpressions with parentheses around them and parse them like a separate search expression
        # NOTE(review): this pairs the FIRST LPAR with the FIRST RPAR, so a
        # nested group such as "((x))" produces an unbalanced slice and
        # raises -- confirm whether nested parentheses are meant to be
        # supported before changing the pairing.
        while SigmaConditionToken.TOKEN_LPAR in tokens:
            lPos = tokens.index(SigmaConditionToken.TOKEN_LPAR)
            lTok = tokens[lPos]
            try:
                rPos = tokens.index(SigmaConditionToken.TOKEN_RPAR)
                rTok = tokens[rPos]
            except ValueError as e:
                raise SigmaParseError("Missing matching closing parentheses") from e
            if lPos + 1 == rPos:
                raise SigmaParseError("Empty subexpression at " + str(lTok.pos))
            if lPos > rPos:
                raise SigmaParseError("Closing parentheses at position " + str(rTok.pos) + " precedes opening at position " + str(lTok.pos))

            subparsed = self.parseSearch(tokens[lPos + 1:rPos])
            tokens = tokens[:lPos] + NodeSubexpression(subparsed) + tokens[rPos + 1:]   # replace parentheses + expression with group node that contains parsed subexpression

        # 2. Iterate over all known operators in given precedence
        for operator in self.searchOperators:
            # 3. reduce all occurrences into corresponding parse tree nodes
            while operator[0] in tokens:
                pos_op = tokens.index(operator[0])
                tok_op = tokens[pos_op]
                if operator[1] == 0:    # operator
                    treenode = operator[2](self.sigmaParser, tok_op)
                    tokens = tokens[:pos_op] + treenode + tokens[pos_op + 1:]
                elif operator[1] == 1:  # operator value
                    pos_val = pos_op + 1
                    tok_val = tokens[pos_val]
                    treenode = operator[2](self.sigmaParser, tok_op, tok_val)
                    tokens = tokens[:pos_op] + treenode + tokens[pos_val + 1:]
                elif operator[1] == 2:  # value1 operator value2
                    pos_val1 = pos_op - 1
                    pos_val2 = pos_op + 1
                    tok_val1 = tokens[pos_val1]
                    tok_val2 = tokens[pos_val2]
                    treenode = operator[2](self.sigmaParser, tok_op, tok_val1, tok_val2)
                    tokens = tokens[:pos_val1] + treenode + tokens[pos_val2 + 1:]

        if len(tokens) != 1:    # parse tree must begin with exactly one node
            raise ValueError("Parse tree must have exactly one start node!")
        query_cond = tokens[0]

        # 4. Integrate conditions from logsources in configurations
        ls_cond = self.sigmaParser.get_logsource_condition()
        if ls_cond is not None:
            cond = ConditionAND()
            cond.add(ls_cond)
            cond.add(query_cond)
            query_cond = cond

        return self._optimizer.optimizeTree(query_cond)

    def __str__(self):  # pragma: no cover
        return str(self.parsedSearch)

    def __len__(self):  # pragma: no cover
        return len(self.parsedSearch)
|
2017-08-02 22:05:48 +00:00
|
|
|
|
2018-07-26 21:40:22 +00:00
|
|
|
# Aggregation parser
|
2017-03-29 20:22:01 +00:00
|
|
|
class SigmaAggregationParser(SimpleParser):
    """Parse Sigma aggregation expression and provide parsed data"""
    # State machine table consumed by SimpleParser:
    # state -> { token type: (attribute to set, transformer method name, next state) }.
    # States 0-7 cover "aggfunc(field) [by groupfield] <op> value";
    # states 8-10 cover "near id [and [not] id ...]".
    parsingrules = [
            {   # State 0
                SigmaConditionToken.TOKEN_AGG:  ("aggfunc", "trans_aggfunc", 1),
                SigmaConditionToken.TOKEN_NEAR: ("aggfunc", "init_near_parsing", 8),
            },
            {   # State 1
                SigmaConditionToken.TOKEN_LPAR: (None, None, 2)
            },
            {   # State 2
                SigmaConditionToken.TOKEN_RPAR: (None, None, 4),
                SigmaConditionToken.TOKEN_ID: ("aggfield", "trans_fieldname", 3),
            },
            {   # State 3
                SigmaConditionToken.TOKEN_RPAR: (None, None, 4)
            },
            {   # State 4
                SigmaConditionToken.TOKEN_BY: ("cond_op", None, 5),
                SigmaConditionToken.TOKEN_EQ: ("cond_op", None, 7),
                SigmaConditionToken.TOKEN_LT: ("cond_op", None, 7),
                SigmaConditionToken.TOKEN_LTE: ("cond_op", None, 7),
                SigmaConditionToken.TOKEN_GT: ("cond_op", None, 7),
                SigmaConditionToken.TOKEN_GTE: ("cond_op", None, 7),
            },
            {   # State 5
                SigmaConditionToken.TOKEN_ID: ("groupfield", "trans_fieldname", 6)
            },
            {   # State 6
                SigmaConditionToken.TOKEN_EQ: ("cond_op", None, 7),
                SigmaConditionToken.TOKEN_LT: ("cond_op", None, 7),
                SigmaConditionToken.TOKEN_LTE: ("cond_op", None, 7),
                SigmaConditionToken.TOKEN_GT: ("cond_op", None, 7),
                SigmaConditionToken.TOKEN_GTE: ("cond_op", None, 7),
            },
            {   # State 7
                SigmaConditionToken.TOKEN_ID: ("condition", None, -1)
            },
            {   # State 8
                SigmaConditionToken.TOKEN_ID: (None, "store_search_id", 9)
            },
            {   # State 9
                SigmaConditionToken.TOKEN_AND: (None, "set_include", 10),
            },
            {   # State 10
                SigmaConditionToken.TOKEN_NOT: (None, "set_exclude", 8),
                SigmaConditionToken.TOKEN_ID: (None, "store_search_id", 9),
            },
            ]
    # States in which the token stream may legally end.
    finalstates = { -1, 9 }

    # Aggregation functions
    AGGFUNC_COUNT = 1
    AGGFUNC_MIN   = 2
    AGGFUNC_MAX   = 3
    AGGFUNC_AVG   = 4
    AGGFUNC_SUM   = 5
    AGGFUNC_NEAR  = 6
    aggfuncmap = {
            "count": AGGFUNC_COUNT,
            "min":   AGGFUNC_MIN,
            "max":   AGGFUNC_MAX,
            "avg":   AGGFUNC_AVG,
            "sum":   AGGFUNC_SUM,
            "near":  AGGFUNC_NEAR,
            }

    def __init__(self, tokens, parser, config):
        # *parser* is the owning SigmaParser, *config* the backend field
        # mapping configuration; SimpleParser.__init__ drives the state
        # machine over *tokens*.
        self.parser = parser
        self.config = config
        self.aggfield = None
        self.groupfield = None
        super().__init__(tokens)

    def trans_aggfunc(self, name):
        """Translate aggregation function name into constant"""
        try:
            return self.aggfuncmap[name]
        except KeyError:
            raise SigmaParseError("Unknown aggregation function '%s'" % (name))

    def trans_fieldname(self, fieldname):
        """Translate field name into configured mapped name"""
        mapped = self.config.get_fieldmapping(fieldname).resolve_fieldname(fieldname, self.parser)
        if type(mapped) == str:
            return mapped
        else:
            raise NotImplementedError("Field mappings in aggregations must be single valued")

    def init_near_parsing(self, name):
        """Initialize data structures for 'near' aggregation operator parsing"""
        self.include = list()
        self.exclude = list()
        self.current = self.include
        return self.trans_aggfunc(name)

    def store_search_id(self, name):
        """Record a search identifier in the currently active include/exclude list."""
        self.current.append(name)
        return name

    def set_include(self, name):
        """Switch collection back to the include list (after AND)."""
        self.current = self.include

    def set_exclude(self, name):
        """Switch collection to the exclude list (after NOT)."""
        self.current = self.exclude
|