Fix performance issue with the disk serializer

This is the issue noted in #76. Keeping all historical results of
queries in the HistoricalQueryResults struct makes serializing and
deserializing those structs slower and slower as history accumulates.
By storing only the most recent execution of each query, we keep the
performance constant, but we lose the feature where osquery can rebuild
timelines without accessing logs. After talking it over, we decided
that this isn't actually that big of a deal: if you really want to
rebuild the old data, you can process the logs, similar to binary log
(binlog) replication in MySQL.
This commit is contained in:
mike@arpaia.co 2014-09-02 13:13:12 -07:00
parent debdb56616
commit 66a2a6fdec
9 changed files with 8 additions and 133 deletions

View File

@ -10,5 +10,7 @@ INCLUDE_DIRECTORIES("${CMAKE_SOURCE_DIR}")
INCLUDE_DIRECTORIES("/usr/local/include")
LINK_DIRECTORIES("/usr/local/lib")
ENABLE_TESTING()
ADD_SUBDIRECTORY(osquery)
ADD_SUBDIRECTORY(tools)

View File

@ -161,30 +161,15 @@ getSerializedHistoricalQueryResults() {
auto qd = getSerializedQueryData();
auto dr = getSerializedDiffResults();
HistoricalQueryResults r;
r.executions = std::deque<int>{2, 1};
r.mostRecentResults.first = 2;
r.mostRecentResults.second = qd.second;
r.pastResults[1] = dr.second;
pt::ptree root;
pt::ptree executions;
pt::ptree item1;
item1.put("", 2);
executions.push_back(std::make_pair("", item1));
pt::ptree item2;
item2.put("", 1);
executions.push_back(std::make_pair("", item2));
root.add_child("executions", executions);
pt::ptree mostRecentResults;
mostRecentResults.add_child("2", qd.first);
root.add_child("mostRecentResults", mostRecentResults);
pt::ptree pastResults;
pastResults.add_child("1", dr.first);
root.add_child("pastResults", pastResults);
return std::make_pair(root, r);
}

View File

@ -68,20 +68,6 @@ bool Query::isQueryNameInDatabase(std::shared_ptr<DBHandle> db) {
return std::find(names.begin(), names.end(), query_.name) != names.end();
}
// Convenience overload: looks up the execution timestamps using the handle
// returned by DBHandle::getInstance() and forwards to the two-argument
// overload. The returned Status is whatever that overload reports.
Status Query::getExecutions(std::deque<int>& results) {
  auto default_db = DBHandle::getInstance();
  return getExecutions(results, default_db);
}
// Loads this query's HistoricalQueryResults through `db` and, on success,
// copies its `executions` deque into `results`.
//
// results: output parameter; only overwritten when the load succeeds.
// db:      database handle passed through to getHistoricalQueryResults().
//
// Returns the Status produced by getHistoricalQueryResults() unchanged.
Status Query::getExecutions(std::deque<int>& results,
                            std::shared_ptr<DBHandle> db) {
  HistoricalQueryResults history;
  auto load_status = getHistoricalQueryResults(history, db);
  if (!load_status.ok()) {
    // Leave `results` untouched when the stored history cannot be read.
    return load_status;
  }
  results = history.executions;
  return load_status;
}
// Convenience overload: records `qd` as the results for `unix_time` using the
// handle returned by DBHandle::getInstance(), by forwarding to the
// three-argument overload. The returned Status is whatever that overload
// reports.
Status Query::addNewResults(const osquery::db::QueryData& qd, int unix_time) {
  auto default_db = DBHandle::getInstance();
  return addNewResults(qd, unix_time, default_db);
}
@ -112,10 +98,8 @@ osquery::Status Query::addNewResults(const osquery::db::QueryData& qd,
if (calculate_diff) {
dr = diff(hQR.mostRecentResults.second, qd);
}
hQR.pastResults[hQR.mostRecentResults.first] = dr;
hQR.mostRecentResults.first = unix_time;
hQR.mostRecentResults.second = qd;
hQR.executions.push_front(unix_time);
std::string json;
auto serialize_status = serializeHistoricalQueryResultsJSON(hQR, json);
if (!serialize_status.ok()) {

View File

@ -75,16 +75,6 @@ class Query {
private:
bool isQueryNameInDatabase(std::shared_ptr<DBHandle> db);
// getExecutions() returns a deque of timestamps of previous query
// executions. These timestamp values are used as the RocksDB sub-keys which
// represent the data stored as a result of those executions.
public:
osquery::Status getExecutions(std::deque<int>& results);
private:
osquery::Status getExecutions(std::deque<int>& results,
std::shared_ptr<DBHandle> db);
// addNewResults adds a new result set to the local data store. If you
// want the diff of the results you've just added, pass a reference to a
// diffResults struct

View File

@ -114,22 +114,6 @@ TEST_F(QueryTests, test_get_stored_query_names) {
EXPECT_NE(in_vector, names.end());
}
// Writes a serialized historical-results fixture under the scheduled query's
// name, then checks that Query::getExecutions() succeeds and yields the
// execution timestamps {2, 1}.
TEST_F(QueryTests, test_get_executions) {
  // Arrange: fixture data, the scheduled query, and a test database handle.
  auto fixture = getSerializedHistoricalQueryResultsJSON();
  auto scheduled = getOsqueryScheduledQuery();
  auto handle = DBHandle::getInstanceAtPath("/tmp/rocksdb-osquery-test14");

  auto stored = handle->Put(kQueries, scheduled.name, fixture.first);
  EXPECT_TRUE(stored.ok());
  EXPECT_EQ(stored.toString(), "OK");

  // Act: read the executions back through the Query wrapper.
  Query cf(scheduled);
  std::deque<int> expected{2, 1};
  std::deque<int> results;
  auto fetched = cf.getExecutions(results, handle);

  // Assert: the call succeeded and returned the expected timestamps.
  EXPECT_TRUE(fetched.ok());
  EXPECT_EQ(fetched.toString(), "OK");
  EXPECT_EQ(results, expected);
}
TEST_F(QueryTests, test_get_current_results) {
auto hQR = getSerializedHistoricalQueryResultsJSON();
auto query = getOsqueryScheduledQuery();

View File

@ -157,16 +157,7 @@ Status serializeHistoricalQueryResultsJSON(const HistoricalQueryResults& r,
Status serializeHistoricalQueryResults(const HistoricalQueryResults& r,
pt::ptree& tree) {
try {
pt::ptree executions;
pt::ptree mostRecentResults;
pt::ptree pastResults;
for (const auto& e : r.executions) {
pt::ptree item;
item.put("", e);
executions.push_back(std::make_pair("", item));
}
tree.add_child("executions", executions);
pt::ptree most_recent_serialized;
auto mrr_status =
@ -178,17 +169,6 @@ Status serializeHistoricalQueryResults(const HistoricalQueryResults& r,
boost::lexical_cast<std::string>(r.mostRecentResults.first),
most_recent_serialized);
tree.add_child("mostRecentResults", mostRecentResults);
for (const auto& i : r.pastResults) {
pt::ptree serialized_diff_results;
auto dr_status = serializeDiffResults(i.second, serialized_diff_results);
if (!dr_status.ok()) {
return dr_status;
}
pastResults.add_child(boost::lexical_cast<std::string>(i.first),
serialized_diff_results);
}
tree.add_child("pastResults", pastResults);
}
catch (const std::exception& e) {
return Status(1, e.what());
@ -199,17 +179,6 @@ Status serializeHistoricalQueryResults(const HistoricalQueryResults& r,
Status deserializeHistoricalQueryResults(const pt::ptree& tree,
HistoricalQueryResults& r) {
try {
for (const auto& v : tree.get_child("executions")) {
try {
int execution =
boost::lexical_cast<int>(v.second.get_value<std::string>());
r.executions.push_back(execution);
}
catch (const boost::bad_lexical_cast& e) {
return Status(1, e.what());
}
}
for (const auto& v : tree.get_child("mostRecentResults")) {
try {
int execution = boost::lexical_cast<int>(v.first);
@ -218,6 +187,7 @@ Status deserializeHistoricalQueryResults(const pt::ptree& tree,
catch (const boost::bad_lexical_cast& e) {
return Status(1, e.what());
}
QueryData q;
for (const auto& each : v.second) {
Row row_;
@ -229,32 +199,6 @@ Status deserializeHistoricalQueryResults(const pt::ptree& tree,
r.mostRecentResults.second = q;
}
for (const auto& v : tree.get_child("pastResults")) {
int execution;
try {
execution = boost::lexical_cast<int>(v.first);
}
catch (const boost::bad_lexical_cast& e) {
return Status(1, e.what());
}
DiffResults dr;
for (const auto& a : v.second.get_child("added")) {
Row row_;
for (const auto& each : a.second) {
row_[each.first] = each.second.get_value<std::string>();
}
dr.added.push_back(row_);
}
for (const auto& r : v.second.get_child("removed")) {
Row row_;
for (const auto& each : r.second) {
row_[each.first] = each.second.get_value<std::string>();
}
dr.removed.push_back(row_);
}
r.pastResults[execution] = dr;
}
return Status(0, "OK");
}
catch (const std::exception& e) {

View File

@ -98,25 +98,14 @@ DiffResults diff(const QueryData& old_, const QueryData& new_);
// HistoricalQueryResults is a struct which represents a scheduled query's
// historical results on disk
struct HistoricalQueryResults {
// a vector of timestamps, sorted by time, starting with the most recent
std::deque<int> executions;
// mostRecentResults->first is the timestamp of the most recent results and
// mostRecentResults->second is the query result data of the most recent
// query
std::pair<int, QueryData> mostRecentResults;
// pastResults is a map of timestamps to data diffs. All timestamps can be
// found pre-sorted in executions. By taking mostRecentResults->second and
// applying the diffs from pastResults (in time order), you can reconstruct
// the entire history of a query's results
std::map<int, DiffResults> pastResults;
// equals operator
bool operator==(const HistoricalQueryResults& comp) const {
return (comp.executions == executions) &&
(comp.mostRecentResults == mostRecentResults) &&
(comp.pastResults == pastResults);
return (comp.mostRecentResults == mostRecentResults);
}
// not equals operator

View File

@ -90,9 +90,7 @@ TEST_F(ResultsTests, test_deserialize_historical_query_results) {
HistoricalQueryResults r;
auto s = deserializeHistoricalQueryResults(results.first, r);
EXPECT_EQ(results.second, r);
EXPECT_EQ(results.second.executions, r.executions);
EXPECT_EQ(results.second.mostRecentResults, r.mostRecentResults);
EXPECT_EQ(results.second.pastResults, r.pastResults);
EXPECT_TRUE(s.ok());
EXPECT_EQ(s.toString(), "OK");
}
@ -102,9 +100,7 @@ TEST_F(ResultsTests, test_deserialize_historical_query_results_json) {
HistoricalQueryResults r;
auto s = deserializeHistoricalQueryResultsJSON(results.first, r);
EXPECT_EQ(results.second, r);
EXPECT_EQ(results.second.executions, r.executions);
EXPECT_EQ(results.second.mostRecentResults, r.mostRecentResults);
EXPECT_EQ(results.second.pastResults, r.pastResults);
EXPECT_TRUE(s.ok());
EXPECT_EQ(s.toString(), "OK");
}

View File

@ -87,17 +87,18 @@ Row parseLaunchdItem(const std::string& path, const pt::ptree& tree) {
r["name"] = bits[bits.size() - 1];
for (const auto& it : kLaunchdTopLevelStringKeys) {
std::string item;
try {
std::string item = tree.get<std::string>(it.first);
item = tree.get<std::string>(it.first);
if (it.first == "Program") {
boost::replace_all(item, " ", "\\ ");
}
r[it.second] = item;
}
catch (const pt::ptree_error& e) {
VLOG(1) << "Error parsing " << it.first << " from " << path << ": "
VLOG(3) << "Error parsing " << it.first << " from " << path << ": "
<< e.what();
}
r[it.second] = item;
}
for (const auto& it : kLaunchdTopLevelArrayKeys) {