Compare commits

...

9 Commits

Author SHA1 Message Date
Ondřej Surý
7b123a1a03 fixup! WIP: Use RCU for ADB 2024-02-26 23:23:20 +01:00
Ondřej Surý
cb86b237a3 WIP: Use RCU for ADB 2024-02-26 23:20:52 +01:00
Ondřej Surý
0c94e37365 WIP: Don't update {names,entries}_last_update if there's more purging to do 2024-02-26 09:30:56 +01:00
Ondřej Surý
b174607ea7 Add CHANGES and release note for [GL #4591] 2024-02-26 09:23:16 +01:00
Ondřej Surý
7177594427 Make the TTL-based cleaning more aggressive
It was discovered that the TTL-based cleaning could build up
a significant backlog of the rdataset headers during the periods where
the top of the TTL heap isn't expired yet.  Make the TTL-based cleaning
more aggressive by cleaning more headers from the heap when we are
adding new header into the RBTDB.
2024-02-26 09:22:56 +01:00
Ondřej Surý
6bbf6d4838 Remove expired rdataset headers from the heap
It was discovered that an expired header could sit on top of the heap
a little longer than desireable.  Remove expired headers (headers with
rdh_ttl set to 0) from the heap completely, so they don't block the next
TTL-based cleaning.
2024-02-26 09:22:56 +01:00
Ondřej Surý
f3682fbf49 Add CHANGES and release note for [GL #4596] 2024-02-26 09:22:54 +01:00
Ondřej Surý
d5694e2e3b Simplify the parent cleaning in the prune_tree() mechanism
Instead of juggling with node locks in a cycle, cleanup the node we are
just pruning and send any the parent that's also subject to the pruning
to the prune tree via normal way (e.g. enqueue pruning on the parent).

This simplifies the code and also spreads the pruning load across more
event loop ticks which is better for lock contention as less things run
in a tight loop.
2024-02-26 09:22:33 +01:00
Ondřej Surý
57b2ae78a3 Remove the contention when pruning RBTDB nodes
In #4383, we reported:

    In some older BIND 9 branches, the extra queuing overhead eliminated by
    this change could be remotely exploited to cause excessive memory use.
    Due to architectural shift, this branch is not vulnerable to that issue,
    but applying the fix to the latter is nevertheless deemed prudent for
    consistency and to make the code future-proof.

The single queue for pruning the nodes increased the lock contention to
the level where cleaning up nodes from the RBTDB would take too long and
the memory that the cache was using would grow indefinitely.

This commit reverts the change to the pruning mechanism as the 9.18+
release was not affected by affor mentioned problem with queueing
overhead.
2024-02-26 09:22:33 +01:00
7 changed files with 483 additions and 419 deletions

View File

@@ -1,3 +1,12 @@
6353. [bug] Improve the TTL-based cleaning by removing the expired
headers from the heap, so they don't block the next
cleaning round and clean more than a single item for
each new addition to the RBTDB. [GL #4591]
6352. [bug] Revert change introduced in fix to [GL #4383] that
converted multiple pruning events into a single pruning
queue. [GL #4596]
6351. [protocol] Support for the RESINFO record type has been added.
[GL #4413]

View File

@@ -70,6 +70,17 @@ Bug Fixes
ISC would like to thank to Jinmei Tatuya from Infoblox for bringing
this issue to our attention.
- A TTL-based cache cleaning was found to be ineffective to a level
that a memory used by the cache could grow indefinitely until
the configure ``max-cache-size`` limit was reached. This has
been fixed. :gl:`#4596`
- The TTL-based cleaning of the cached DNS records was ineffective
cleaning less records from the cache than adding over the time.
This could result in a significant backlog of DNS records to be
cleaned which could result in memory growth and ultimately triggering
overmem LRU-based cleaning that's more aggressive, but also slower.
:gl:`#4591`
Known Issues
~~~~~~~~~~~~

File diff suppressed because it is too large Load Diff

View File

@@ -112,12 +112,6 @@ struct dns_rbtnode {
*/
ISC_LINK(dns_rbtnode_t) deadlink;
/*%
* This linked list is used to store nodes from which tree pruning can
* be started.
*/
ISC_LINK(dns_rbtnode_t) prunelink;
/*@{*/
/*!
* These values are used in the RBT DB implementation. The appropriate

View File

@@ -1406,7 +1406,6 @@ rbtnode_new(isc_mem_t *mctx, const dns_name_t *name) {
};
ISC_LINK_INIT(node, deadlink);
ISC_LINK_INIT(node, prunelink);
isc_refcount_init(&node->references, 0);

View File

@@ -337,6 +337,12 @@ dns__rbtdb_setttl(dns_slabheader_t *header, dns_ttl_t newttl) {
} else {
isc_heap_decreased(header->heap, header->heap_index);
}
if (newttl == 0) {
INSIST(header->heap_index != 0);
isc_heap_delete(header->heap, header->heap_index);
header->heap_index = 0;
}
}
static bool
@@ -467,7 +473,6 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log) {
unsigned int i;
isc_result_t result;
char buf[DNS_NAME_FORMATSIZE];
dns_rbtnode_t *node = NULL;
dns_rbt_t **treep = NULL;
isc_time_t start;
@@ -490,6 +495,8 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log) {
* the overhead of unlinking all nodes here should be negligible.
*/
for (i = 0; i < rbtdb->node_lock_count; i++) {
dns_rbtnode_t *node = NULL;
node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
while (node != NULL) {
ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink);
@@ -497,12 +504,6 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log) {
}
}
node = ISC_LIST_HEAD(rbtdb->prunenodes);
while (node != NULL) {
ISC_LIST_UNLINK(rbtdb->prunenodes, node, prunelink);
node = ISC_LIST_HEAD(rbtdb->prunenodes);
}
rbtdb->quantum = (rbtdb->loop != NULL) ? 100 : 0;
for (;;) {
@@ -1139,10 +1140,11 @@ delete_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
*/
void
dns__rbtdb_newref(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
isc_rwlocktype_t locktype DNS__DB_FLARG) {
isc_rwlocktype_t nlocktype DNS__DB_FLARG) {
uint_fast32_t refs;
if (locktype == isc_rwlocktype_write && ISC_LINK_LINKED(node, deadlink))
if (nlocktype == isc_rwlocktype_write &&
ISC_LINK_LINKED(node, deadlink))
{
ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum], node,
deadlink);
@@ -1181,26 +1183,16 @@ is_leaf(dns_rbtnode_t *node) {
node->left == NULL && node->right == NULL);
}
/*%
* The tree lock must be held when this function is called as it reads and
* updates rbtdb->prunenodes.
*/
static void
send_to_prune_tree(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
isc_rwlocktype_t locktype DNS__DB_FLARG) {
bool pruning_queued = (ISC_LIST_HEAD(rbtdb->prunenodes) != NULL);
isc_rwlocktype_t nlocktype DNS__DB_FLARG) {
prune_t *prune = isc_mem_get(rbtdb->common.mctx, sizeof(*prune));
*prune = (prune_t){ .node = node };
INSIST(locktype == isc_rwlocktype_write);
dns_db_attach((dns_db_t *)rbtdb, &prune->db);
dns__rbtdb_newref(rbtdb, node, nlocktype DNS__DB_FLARG_PASS);
dns__rbtdb_newref(rbtdb, node, locktype DNS__DB_FLARG_PASS);
INSIST(!ISC_LINK_LINKED(node, prunelink));
ISC_LIST_APPEND(rbtdb->prunenodes, node, prunelink);
if (!pruning_queued) {
dns_db_t *db = NULL;
dns_db_attach((dns_db_t *)rbtdb, &db);
isc_async_run(rbtdb->loop, prune_tree, db);
}
isc_async_run(rbtdb->loop, prune_tree, prune);
}
/*%
@@ -1450,10 +1442,6 @@ dns__rbtdb_decref(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
#undef KEEP_NODE
if (write_locked) {
/*
* We can now delete the node.
*/
/*
* If this node is the only one in the level it's in, deleting
* this node may recursively make its parent the only node in
@@ -1470,12 +1458,14 @@ dns__rbtdb_decref(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
* it's their responsibility to purge stale leaves (e.g. by
* periodic walk-through).
*/
if (!pruning && is_leaf(node) && rbtdb->loop != NULL) {
send_to_prune_tree(
rbtdb, node,
isc_rwlocktype_write DNS__DB_FLARG_PASS);
send_to_prune_tree(rbtdb, node,
*nlocktypep DNS__DB_FLARG_PASS);
no_reference = false;
} else {
/* We can now delete the node. */
delete_node(rbtdb, node);
}
} else {
@@ -1498,83 +1488,47 @@ restore_locks:
}
/*
* Prune the tree by recursively cleaning up single leaves. Go through all
* nodes stored in the rbtdb->prunenodes list; for each of them, in the worst
* case, it will be necessary to traverse a number of tree levels equal to the
* maximum legal number of domain name labels (127); in practice, the number of
* tree levels to traverse will virtually always be much smaller (a few levels
* at most). While holding the tree lock throughout this entire operation is
* less than ideal, so is splitting the latter up by queueing a separate
* prune_tree() run for each node to start pruning from (as queueing requires
* allocating memory and can therefore potentially be exploited to exhaust
* available memory). Also note that actually freeing up the memory used by
* RBTDB nodes (which is what this function does) is essential to keeping cache
* memory use in check, so since the tree lock needs to be acquired anyway,
* freeing as many nodes as possible before the tree lock gets released is
* prudent.
* Prune the tree by recursively cleaning-up single leaves. In the worst
* case, the number of iteration is the number of tree levels, which is at
* most the maximum number of domain name labels, i.e, 127. In practice, this
* should be much smaller (only a few times), and even the worst case would be
* acceptable for a single event.
*/
static void
prune_tree(void *arg) {
dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)arg;
dns_rbtnode_t *node = NULL;
prune_t *prune = (prune_t *)arg;
dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)prune->db;
dns_rbtnode_t *node = prune->node;
dns_rbtnode_t *parent = NULL;
unsigned int locknum;
isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
isc_mem_put(rbtdb->common.mctx, prune, sizeof(*prune));
TREE_WRLOCK(&rbtdb->tree_lock, &tlocktype);
locknum = node->locknum;
NODE_WRLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
while ((node = ISC_LIST_HEAD(rbtdb->prunenodes)) != NULL) {
locknum = node->locknum;
NODE_WRLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
do {
if (ISC_LINK_LINKED(node, prunelink)) {
ISC_LIST_UNLINK(rbtdb->prunenodes, node,
prunelink);
}
parent = node->parent;
dns__rbtdb_decref(rbtdb, node, 0, &nlocktype, &tlocktype, true,
true DNS__DB_FILELINE);
parent = node->parent;
dns__rbtdb_decref(rbtdb, node, 0, &nlocktype,
&tlocktype, true,
true DNS__DB_FILELINE);
NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
if (parent != NULL && parent->down == NULL) {
/*
* node was the only down child of the parent
* and has just been removed. We'll then need
* to examine the parent. Keep the lock if
* possible; otherwise, release the old lock and
* acquire one for the parent.
*/
if (parent->locknum != locknum) {
NODE_UNLOCK(
&rbtdb->node_locks[locknum].lock,
&nlocktype);
locknum = parent->locknum;
NODE_WRLOCK(
&rbtdb->node_locks[locknum].lock,
&nlocktype);
}
/*
* We need to gain a reference to the node
* before decrementing it in the next iteration.
*/
if (ISC_LINK_LINKED(parent, deadlink)) {
ISC_LIST_UNLINK(
rbtdb->deadnodes[locknum],
parent, deadlink);
}
dns__rbtdb_newref(rbtdb, parent,
nlocktype DNS__DB_FILELINE);
} else {
parent = NULL;
}
node = parent;
} while (node != NULL);
NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype);
if (parent != NULL && parent->down == NULL) {
/*
* node was the only down child of the parent and has
* just been removed. Send the parent to the prune
* tree to be examined as well.
*/
NODE_WRLOCK(&rbtdb->node_locks[parent->locknum].lock,
&nlocktype);
send_to_prune_tree(rbtdb, parent, nlocktype);
NODE_UNLOCK(&rbtdb->node_locks[parent->locknum].lock,
&nlocktype);
}
TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype);
dns_db_detach((dns_db_t **)&rbtdb);
@@ -3219,6 +3173,11 @@ cleanup:
return (result);
}
static void
expire_ttl_headers(dns_rbtdb_t *rbtdb, unsigned int locknum, size_t purgecount,
isc_rwlocktype_t *tlocktypep,
isc_stdtime_t now DNS__DB_FLARG);
isc_result_t
dns__rbtdb_addrdataset(dns_db_t *db, dns_dbnode_t *node,
dns_dbversion_t *version, isc_stdtime_t now,
@@ -3229,7 +3188,6 @@ dns__rbtdb_addrdataset(dns_db_t *db, dns_dbnode_t *node,
dns_rbtdb_version_t *rbtversion = version;
isc_region_t region;
dns_slabheader_t *newheader = NULL;
dns_slabheader_t *header = NULL;
isc_result_t result;
bool delegating;
bool newnsec;
@@ -3401,14 +3359,8 @@ dns__rbtdb_addrdataset(dns_db_t *db, dns_dbnode_t *node,
rbtnode->locknum DNS__DB_FLARG_PASS);
}
header = isc_heap_element(rbtdb->heaps[rbtnode->locknum], 1);
if (header != NULL && header->ttl + STALE_TTL(header, rbtdb) <
now - RBTDB_VIRTUAL)
{
dns__cacherbt_expireheader(
header, &tlocktype,
dns_expire_ttl DNS__DB_FLARG_PASS);
}
expire_ttl_headers(rbtdb, rbtnode->locknum, 5, &tlocktype,
now DNS__DB_FLARG_PASS);
/*
* If we've been holding a write lock on the tree just for
@@ -3951,8 +3903,6 @@ dns__rbtdb_create(isc_mem_t *mctx, const dns_name_t *origin, dns_dbtype_t type,
ISC_LIST_INIT(rbtdb->deadnodes[i]);
}
ISC_LIST_INIT(rbtdb->prunenodes);
rbtdb->active = rbtdb->node_lock_count;
for (i = 0; i < (int)(rbtdb->node_lock_count); i++) {
@@ -4937,3 +4887,28 @@ dns__rbtdb_deletedata(dns_db_t *db ISC_ATTR_UNUSED,
}
}
}
static void
expire_ttl_headers(dns_rbtdb_t *rbtdb, unsigned int locknum, size_t purgecount,
isc_rwlocktype_t *tlocktypep,
isc_stdtime_t now DNS__DB_FLARG) {
dns_slabheader_t *header = NULL;
for (size_t i = 0; i < purgecount; i++) {
header = isc_heap_element(rbtdb->heaps[locknum], 1);
if (header == NULL) {
/* No more work; exit cleaning */
return;
}
if (header->ttl + STALE_TTL(header, rbtdb) >=
now - RBTDB_VIRTUAL)
{
/* No more work; exit cleaning */
return;
}
dns__cacherbt_expireheader(header, tlocktypep,
dns_expire_ttl DNS__DB_FLARG_PASS);
}
}

View File

@@ -152,10 +152,6 @@ struct dns_rbtdb {
*/
dns_rbtnodelist_t *deadnodes;
/* List of nodes from which recursive tree pruning can be started from.
* Locked by tree_lock. */
dns_rbtnodelist_t prunenodes;
/*
* Heaps. These are used for TTL based expiry in a cache,
* or for zone resigning in a zone DB. hmctx is the memory
@@ -202,6 +198,14 @@ typedef struct {
isc_stdtime_t now;
} rbtdb_load_t;
/*%
* Prune context
*/
typedef struct {
dns_db_t *db;
dns_rbtnode_t *node;
} prune_t;
extern dns_dbmethods_t dns__rbtdb_zonemethods;
extern dns_dbmethods_t dns__rbtdb_cachemethods;