riak_test/tests/ts_data.erl

%% -*- Mode: Erlang -*-
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2015, 2016 Basho Technologies, Inc.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License.  You may obtain
%% a copy of the License at
%%
%%   http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied.  See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%% @doc A util module for riak_ts tests

-module(ts_data).

-include_lib("eunit/include/eunit.hrl").

-export([get_ddl/0, get_ddl/1, get_ddl/2,
         assert/3,
         assert_error_regex/3,
         assert_float/3,
         assert_row_sets/2,
         exclusive_result_from_data/3,
         flat_format/2,
         get_bool/1,
         get_cols/0, get_cols/1,
         get_data/1,
         get_default_bucket/0,
         get_float/0,
         get_integer/0,
         get_invalid_obj/0,
         get_invalid_qry/1,
         get_long_obj/0,
         get_map/1,
         get_optional/2,
         get_short_obj/0,
         get_string/1,
         get_timestamp/0,
         get_valid_aggregation_data/1,
         get_valid_aggregation_data_not_null/1,
         get_valid_big_data/1,
         get_valid_obj/0,
         get_valid_qry/0, get_valid_qry/1,
         get_valid_qry/2, get_valid_qry/3,
         get_valid_qry_spanning_quanta/0,
         get_valid_select_data/0, get_valid_select_data/1,
         get_valid_select_data_spanning_quanta/0,
         get_varchar/0,
         remove_last/1,
         results/1
        ]).

-define(MAXVARCHARLEN,       16).
-define(MAXTIMESTAMP,        trunc(math:pow(2, 63))).
-define(MAXFLOAT,            math:pow(2, 63)).

%% This is also the name of the table
get_default_bucket() ->
    "GeoCheckin".

get_valid_qry() ->
    get_valid_qry(get_default_bucket()).

get_valid_qry(Table) ->
    "select * from " ++ Table ++ " Where time > 1 and time < 10 and myfamily = 'family1' and myseries ='seriesX'".

get_valid_qry(Lower, Upper) ->
    get_valid_qry(Lower, Upper, get_default_bucket()).

get_valid_qry(Lower, Upper, Table) ->
    lists:flatten(io_lib:format("select * from " ++ Table ++ " Where time > ~B and time < ~B and myfamily = 'family1' and myseries ='seriesX'", [Lower, Upper])).

get_invalid_qry(borked_syntax) ->
    "selectah * from GeoCheckin Where time > 1 and time < 10";
get_invalid_qry(key_not_covered) ->
    "select * from GeoCheckin Where time > 1 and time < 10";
get_invalid_qry(invalid_operator) ->
    "select * from GeoCheckin Where time > 1 and time < 10 and myfamily = 'family1' and myseries ='seriesX' and weather > 'bob'";
get_invalid_qry(field_comparison) ->
    "select * from GeoCheckin Where time > 1 and time < 10 and myfamily = 'family1' and myseries ='seriesX' and weather = family1";
get_invalid_qry(type_error) ->
    "select * from GeoCheckin Where time > 1 and time < 10 and myfamily = 'family1' and myseries ='seriesX' and weather > true".

get_valid_select_data() ->
    get_valid_select_data(fun() -> lists:seq(1, 10) end).

get_valid_select_data(SeqFun) ->
    Family = <<"family1">>,
    Series = <<"seriesX">>,
    Times = SeqFun(),
    [{Family, Series, X, get_varchar(), get_float()} || X <- Times].


-define(SPANNING_STEP_BIG, (1000)).

get_valid_big_data(N) ->
    Family = <<"family1">>,
    Series = <<"seriesX">>,
    Times = lists:seq(1, N),
    [{
        Family,
        Series,
        1 + N * ?SPANNING_STEP_BIG,
        N,
        N + 0.1,
        get_bool(X),
        N + 100000,
        get_varchar(),
        get_optional(X, X)
    } || X <- Times].

get_valid_aggregation_data(N) ->
    Family = <<"family1">>,
    Series = <<"seriesX">>,
    Times = lists:seq(1, N),
    [{Family, Series, X,
      get_optional(X, get_float()),
      get_optional(X+1, get_float()),
      get_optional(X*3, get_float())} || X <- Times].

get_valid_aggregation_data_not_null(N) ->
    Family = <<"family1">>,
    Series = <<"seriesX">>,
    Times = lists:seq(1, N),
    [{Family, Series, X,
      get_float(),
      get_float(),
      get_float()} || X <- Times].

-define(SPANNING_STEP, (1000*60*5)).

get_valid_qry_spanning_quanta() ->
    StartTime = 1 + ?SPANNING_STEP *  1,
    EndTime   = 1 + ?SPANNING_STEP * 10,
    lists:flatten(
      io_lib:format("select * from GeoCheckin Where time >= ~b and time <= ~b"
                    " and myfamily = 'family1' and myseries = 'seriesX'",
                    [StartTime, EndTime])).

get_valid_select_data_spanning_quanta() ->
    Family = <<"family1">>,
    Series = <<"seriesX">>,
    Times = lists:seq(1 + ?SPANNING_STEP, 1 + ?SPANNING_STEP * 10, ?SPANNING_STEP),  %% five-minute intervals, to span 15-min buckets
    [{Family, Series, X, get_varchar(), get_float()} || X <- Times].

get_cols() ->
    get_cols(small).
get_cols(small) ->
    [<<"myfamily">>,
     <<"myseries">>,
     <<"time">>,
     <<"weather">>,
     <<"temperature">>];
get_cols(api) ->
    [<<"myfamily">>,
     <<"myseries">>,
     <<"time">>,
     <<"myint">>,
     <<"mybin">>,
     <<"myfloat">>,
     <<"mybool">>].


exclusive_result_from_data(Data, Start, Finish) when is_integer(Start)   andalso
                                                     is_integer(Finish)  andalso
                                                     Start  > 0          andalso
                                                     Finish > 0          andalso
                                                     Finish > Start ->
    lists:sublist(Data, Start, Finish - Start + 1).

remove_last(Data) ->
    lists:sublist(Data, length(Data)-1).

%% a valid DDL - the one used in the documents
get_ddl() ->
    get_ddl(small).

get_ddl(small) ->
    get_ddl(small, get_default_bucket());
get_ddl(big) ->
    get_ddl(big, get_default_bucket());
get_ddl(api) ->
    get_ddl(api, get_default_bucket());
get_ddl(aggregation) ->
    get_ddl(aggregation, "WeatherData").


get_ddl(small, Table) ->
    "CREATE TABLE " ++ table_to_list(Table) ++ " ("
    " myfamily    varchar   not null,"
    " myseries    varchar   not null,"
    " time        timestamp not null,"
    " weather     varchar   not null,"
    " temperature double,"
    " PRIMARY KEY ((myfamily, myseries, quantum(time, 15, 'm')), "
    " myfamily, myseries, time))";

%% another valid DDL - one with all the good stuff like
%% different types and optional blah-blah
get_ddl(big, Table) ->
    "CREATE TABLE " ++ table_to_list(Table) ++ " ("
    " myfamily    varchar     not null,"
    " myseries    varchar     not null,"
    " time        timestamp   not null,"
    " myint       sint64      not null,"
    " myfloat     double      not null,"
    " mybool      boolean     not null,"
    " mytimestamp timestamp   not null,"
    " myvarchar   varchar     not null,"
    " myoptional  sint64,"
    " PRIMARY KEY ((myfamily, myseries, quantum(time, 15, 'm')),"
    " myfamily, myseries, time))";

%% the other DDLs which used to be here but were only used once, each
%% in a corresponding ts_A_create_*_fail module, have been moved to
%% those respective modules

get_ddl(api, Table) ->
    "CREATE TABLE " ++ table_to_list(Table) ++ " ("
    " myfamily    varchar     not null,"
    " myseries    varchar     not null,"
    " time        timestamp   not null,"
    " myint       sint64      not null,"
    " mybin       varchar     not null,"
    " myfloat     double      not null,"
    " mybool      boolean     not null,"
    " PRIMARY KEY ((myfamily, myseries, quantum(time, 15, 'm')),"
    " myfamily, myseries, time))";

%% DDL for testing aggregation behavior
get_ddl(aggregation, Table) ->
    "CREATE TABLE " ++ table_to_list(Table) ++ " ("
    " myfamily      varchar   not null,"
    " myseries      varchar   not null,"
    " time          timestamp not null,"
    " temperature   double,"
    " pressure      double,"
    " precipitation double,"
    " PRIMARY KEY ((myfamily, myseries, quantum(time, 10, 'm')), "
    " myfamily, myseries, time))".

table_to_list(Table) when is_binary(Table) ->
    binary_to_list(Table);
table_to_list(Table) ->
    Table.

get_data(api) ->
    [{<<"family1">>, <<"seriesX">>, 100, 1, <<"test1">>, 1.0, true}] ++
    [{<<"family1">>, <<"seriesX">>, 200, 2, <<"test2">>, 2.0, false}] ++
    [{<<"family1">>, <<"seriesX">>, 300, 3, <<"test3">>, 3.0, true}] ++
    [{<<"family1">>, <<"seriesX">>, 400, 4, <<"test4">>, 4.0, false}].

get_map(api) ->
    [{<<"myfamily">>, 1},
     {<<"myseries">>, 2},
     {<<"time">>,     3},
     {<<"myint">>,    4},
     {<<"mybin">>,    5},
     {<<"myfloat">>,  6},
     {<<"mybool">>,   7}].


get_valid_obj() ->
    {get_varchar(),
     get_varchar(),
     get_timestamp(),
     get_varchar(),
     get_float()}.

get_invalid_obj() ->
    {get_varchar(),
     get_integer(),   % this is the duff field
     get_timestamp(),
     get_varchar(),
     get_float()}.

get_short_obj() ->
    {get_varchar(),
        get_varchar(),
        get_timestamp(),
        get_varchar()}.

get_long_obj() ->
    {get_varchar(),
        get_varchar(),
        get_timestamp(),
        get_varchar(),
        get_float(),
        get_float()}.

get_varchar() ->
    Len = random:uniform(?MAXVARCHARLEN),
    String = get_string(Len),
    list_to_binary(String).

get_string(Len) ->
    get_s(Len, []).

get_integer() ->
    get_timestamp().

get_s(0, Acc) ->
    Acc;
get_s(N, Acc) when is_integer(N) andalso N > 0 ->
    %% make it plain ASCII
    get_s(N - 1, [crypto:rand_uniform($a, $z) | Acc]).

get_timestamp() ->
    random:uniform(?MAXTIMESTAMP).

get_float() ->
    F1 = random:uniform(trunc(?MAXFLOAT)),
    F2 = random:uniform(trunc(?MAXFLOAT)),
    F1 - F2 + random:uniform().

get_bool(N) when N < 5 -> true;
get_bool(_)            -> false.

get_optional(N, X) ->
    case N rem 2 of
        0 -> X;
        1 -> []
    end.


-define(DELTA, 1.0e-15).

assert_float(String, {ok, Thing1}, {ok, Thing2}) ->
    assert_float(String, Thing1, Thing2);
assert_float(String, {Cols, [ValsA]} = Exp, {Cols, [ValsB]} = Got) ->
    case assertf2(tuple_to_list(ValsA), tuple_to_list(ValsB)) of
        fail ->
            lager:info("*****************", []),
            lager:info("Test ~p failed", [String]),
            lager:info("Exp ~p", [Exp]),
            lager:info("Got ~p", [Got]),
            lager:info("*****************", []),
            fail;
        pass ->
            pass
    end;
assert_float(String, Exp, Got) ->
    assert(String, Exp, Got).

assertf2([], []) -> pass;
assertf2([H1 | T1], [H2 | T2]) when is_float(H1), is_float(H2) ->
    Diff = H1 - H2,
    Av = abs(H1 + H2)/2,
    if Diff/Av > ?DELTA -> fail;
       el/=se           -> assertf2(T1, T2)
    end;
assertf2([H | T1], [H | T2]) ->
    assertf2(T1, T2);
assertf2(_, _) ->
    fail.


assert(_,      X,   X)   -> pass;
%% as a special case, if you don't know the exact words to expect in
%% the error message, use a '_' to match any string
assert(_, {error, {ErrCode, _ErrMsg}}, {error, {ErrCode, '_'}}) ->
    pass;
assert(String, Exp, Got) ->
    lager:info("*****************", []),
    lager:info("Test ~p failed", [String]),
    lager:info("Exp ~p", [Exp]),
    lager:info("Got ~p", [Got]),
    lager:info("*****************", []),
    fail.

%% Match an error code and use a regex to match the error string
assert_error_regex(String, {error, {Code, Regex}}, {error, {Code, Msg}}) ->
    {ok, RE} = re:compile(Regex),
    Match = re:run(Msg, RE),
    assert_error_regex_result(Match, String, Regex, Msg);
assert_error_regex(String, Got, Expected) ->
    assert_error_regex_result(nomatch, String, Got, Expected).
assert_error_regex_result(nomatch, String, Expected, Got) ->
    assert(String, Expected, Got);
assert_error_regex_result(_, _String, _Expected, _Got) ->
    pass.

%% If `ColExpected' is the atom `rt_ignore_columns' then do not assert columns.
assert_row_sets(_, {error,_} = Error) ->
    ct:fail(Error);
assert_row_sets({rt_ignore_columns, Expected}, {_, {_, Actual}}) ->
    ct_verify_rows(Expected, Actual);
assert_row_sets({_, {ColExpected, Expected}}, {_, {ColsActual, Actual}}) ->
    ?assertEqual(ColExpected, ColsActual),
    ct_verify_rows(Expected, Actual);
assert_row_sets(Expected, Actual) when is_list(Expected), is_list(Actual) ->
    ct_verify_rows(Expected, Actual).

ct_verify_rows(Expected, Actual) ->
    case tdiff:diff(Expected, Actual) of
        [{eq,_}] ->
            pass;
        [] ->
            pass;
        Diff ->
            ct:pal("ROW DIFF~n" ++ format_diff(0, Diff)),
            ct:fail(row_set_mismatch)
    end.

%%
format_diff(EqCount,[]) ->
    format_diff_eq_count(EqCount);
format_diff(EqCount,[{eq,Rows}|Tail]) ->
    format_diff(EqCount+length(Rows), Tail);
format_diff(EqCount,[{Type, Row}|Tail]) ->
    Fmt = io_lib:format("~s~s ~p~n",
        [format_diff_eq_count(EqCount), format_diff_type(Type), Row]),
    [Fmt | format_diff(0, Tail)].

%%
format_diff_type(del) -> "-";
format_diff_type(ins) -> "+".

%%
format_diff_eq_count(0) ->
    "";
format_diff_eq_count(Count) ->
    [lists:duplicate(Count, $.), $\n].


results(Results) ->
    Expected = lists:duplicate(length(Results), pass),
    ?assertEqual(Expected, Results).

flat_format(Format, Args) ->
    lists:flatten(io_lib:format(Format, Args)).