Bug 4157

Summary: unbound: DNS resolver process gradually consumes RAM until none left, stops resolving
Product: unbound Reporter: jp <s0m3f00l>
Component: serverAssignee: unbound team <unbound-team>
Status: RESOLVED DUPLICATE    
Severity: critical CC: cathya, ericluehrsen, s0m3f00l, wouter
Priority: P5    
Version: 1.7.3   
Hardware: Other   
OS: Linux   
URL: https://github.com/openwrt/packages/issues/6957#issuecomment-423476221

Description jp 2018-09-21 12:30:13 CEST
Environment: arm_cortex-a9_vfpv3, wrt3200acm, OpenWrt 18.06.0 r7188-b0b5c64c22 OR any OpenWrt install

Description: the unbound DNS resolver gradually consumes RAM over the course of up to three weeks, until all 512M are consumed at which point DNS resolution stops working.

root@link:~# cat /etc/openwrt_release
DISTRIB_ID='OpenWrt'
DISTRIB_RELEASE='18.06.0'
DISTRIB_REVISION='r7188-b0b5c64c22'
DISTRIB_TARGET='mvebu/cortexa9'
DISTRIB_ARCH='arm_cortex-a9_vfpv3'
DISTRIB_DESCRIPTION='OpenWrt 18.06.0 r7188-b0b5c64c22'
DISTRIB_TAINTS=''
root@link:~# 

root@link:~# opkg info unbound
Package: unbound
Version: 1.7.3-2
Depends: libc, libopenssl, libunbound
Status: install ok installed
Architecture: arm_cortex-a9_vfpv3
Conffiles:
 /etc/config/unbound efedbbe21c74a37135e98d9344aba20a23f69a7dc3b581659f1ceabdb744c02a
 /etc/unbound/unbound.conf 4ec559b4a9331fa1af4aea21d4dfffe1111cb107aabc256048c363b29a560da2
 /etc/unbound/unbound_ext.conf e8b6b7e5d1a92385ca39ac47e9c763a44411ca17e294a5caee3c09117c486755
 /etc/unbound/unbound_srv.conf 62b6e7d5d67d146f98c1a4687742a7929e53217f98e9068bb547ca4fbbfd984b
Installed-Time: 1533529446

root@link:~# uptime
 13:49:53 up 14 days, 21:25,  load average: 0.00, 0.00, 0.00
root@link:~# grep ^Vm /proc/$(pidof unbound)/status
VmPeak:   315772 kB
VmSize:   315772 kB
VmLck:         0 kB
VmPin:         0 kB
VmHWM:    268760 kB
VmRSS:    268760 kB
VmData:   312472 kB
VmStk:       132 kB
VmExe:       172 kB
VmLib:      2904 kB
VmPTE:       316 kB
VmPMD:         0 kB
VmSwap:        0 kB

root@link:~# grep -v -e '^#' -e '^ +#' -e '^\t#' -e '^$' /etc/unbound/*.conf
/etc/unbound/unbound.conf:server:
/etc/unbound/unbound.conf:      verbosity: 1
/etc/unbound/unbound.conf:      username: "unbound"
/etc/unbound/unbound.conf:      directory: "/var/lib/unbound"
/etc/unbound/unbound.conf:      chroot: "/var/lib/unbound"
/etc/unbound/unbound.conf:      pidfile: "/var/run/unbound.pid"
/etc/unbound/unbound.conf:      num-threads: 1
/etc/unbound/unbound.conf:      msg-cache-slabs: 1
/etc/unbound/unbound.conf:      rrset-cache-slabs: 1
/etc/unbound/unbound.conf:      infra-cache-slabs: 1
/etc/unbound/unbound.conf:      key-cache-slabs: 1
/etc/unbound/unbound.conf:      interface: 0.0.0.0
/etc/unbound/unbound.conf:      interface: ::0
/etc/unbound/unbound.conf:      access-control: 0.0.0.0/0 allow
/etc/unbound/unbound.conf:      access-control: ::0/0 allow
/etc/unbound/unbound.conf:      outgoing-num-tcp: 1
/etc/unbound/unbound.conf:      incoming-num-tcp: 1
/etc/unbound/unbound.conf:      outgoing-port-permit: "10240-65335"
/etc/unbound/unbound.conf:      outgoing-range: 60
/etc/unbound/unbound.conf:      num-queries-per-thread: 30
/etc/unbound/unbound.conf:      msg-buffer-size: 8192
/etc/unbound/unbound.conf:      infra-cache-numhosts: 200
/etc/unbound/unbound.conf:      msg-cache-size: 100k
/etc/unbound/unbound.conf:      rrset-cache-size: 100k
/etc/unbound/unbound.conf:      key-cache-size: 100k
/etc/unbound/unbound.conf:      neg-cache-size: 10k
/etc/unbound/unbound.conf:      target-fetch-policy: "2 1 0 0 0 0"
/etc/unbound/unbound.conf:      harden-large-queries: yes
/etc/unbound/unbound.conf:      harden-short-bufsize: yes
/etc/unbound/unbound.conf:
/etc/unbound/unbound.conf:python:
/etc/unbound/unbound.conf:remote-control:
/etc/unbound/unbound_ext.conf:forward-zone:
/etc/unbound/unbound_ext.conf:  name: "."
/etc/unbound/unbound_ext.conf:  forward-addr: 1.0.0.1@853
/etc/unbound/unbound_ext.conf:  forward-ssl-upstream: yes
/etc/unbound/unbound_srv.conf:use-syslog: no
/etc/unbound/unbound_srv.conf:logfile: /mnt/sda1/unbound_log
/etc/unbound/unbound_srv.conf:log-queries: yes
/etc/unbound/unbound_srv.conf:log-replies: yes

root@link:~# unbound-control stats_noreset
thread0.num.queries=158407
thread0.num.queries_ip_ratelimited=0
thread0.num.cachehits=15381
thread0.num.cachemiss=143026
thread0.num.prefetch=0
thread0.num.zero_ttl=0
thread0.num.recursivereplies=143025
thread0.requestlist.avg=0.536707
thread0.requestlist.max=11
thread0.requestlist.overwritten=0
thread0.requestlist.exceeded=0
thread0.requestlist.current.all=1
thread0.requestlist.current.user=1
thread0.recursion.time.avg=0.024748
thread0.recursion.time.median=0.0290863
thread0.tcpusage=0
total.num.queries=158407
total.num.queries_ip_ratelimited=0
total.num.cachehits=15381
total.num.cachemiss=143026
total.num.prefetch=0
total.num.zero_ttl=0
total.num.recursivereplies=143025
total.requestlist.avg=0.536707
total.requestlist.max=11
total.requestlist.overwritten=0
total.requestlist.exceeded=0
total.requestlist.current.all=1
total.requestlist.current.user=1
total.recursion.time.avg=0.024748
total.recursion.time.median=0.0290863
total.tcpusage=0
time.now=1536183628.523561
time.up=1289732.424684
time.elapsed=1289732.424684
mem.cache.rrset=1710898
mem.cache.message=1272729
mem.mod.iterator=16504
mem.mod.validator=0
mem.mod.respip=0
histogram.000000.000000.to.000000.000001=3748
histogram.000000.000001.to.000000.000002=0
histogram.000000.000002.to.000000.000004=0
histogram.000000.000004.to.000000.000008=0
histogram.000000.000008.to.000000.000016=0
histogram.000000.000016.to.000000.000032=0
histogram.000000.000032.to.000000.000064=1
histogram.000000.000064.to.000000.000128=150
histogram.000000.000128.to.000000.000256=155
histogram.000000.000256.to.000000.000512=259
histogram.000000.000512.to.000000.001024=400
histogram.000000.001024.to.000000.002048=595
histogram.000000.002048.to.000000.004096=1501
histogram.000000.004096.to.000000.008192=9296
histogram.000000.008192.to.000000.016384=1960
histogram.000000.016384.to.000000.032768=68939
histogram.000000.032768.to.000000.065536=34762
histogram.000000.065536.to.000000.131072=14711
histogram.000000.131072.to.000000.262144=3451
histogram.000000.262144.to.000000.524288=1787
histogram.000000.524288.to.000001.000000=903
histogram.000001.000000.to.000002.000000=270
histogram.000002.000000.to.000004.000000=98
histogram.000004.000000.to.000008.000000=20
histogram.000008.000000.to.000016.000000=15
histogram.000016.000000.to.000032.000000=4
histogram.000032.000000.to.000064.000000=0
histogram.000064.000000.to.000128.000000=0
histogram.000128.000000.to.000256.000000=0
histogram.000256.000000.to.000512.000000=0
histogram.000512.000000.to.001024.000000=0
histogram.001024.000000.to.002048.000000=0
histogram.002048.000000.to.004096.000000=0
histogram.004096.000000.to.008192.000000=0
histogram.008192.000000.to.016384.000000=0
histogram.016384.000000.to.032768.000000=0
histogram.032768.000000.to.065536.000000=0
histogram.065536.000000.to.131072.000000=0
histogram.131072.000000.to.262144.000000=0
histogram.262144.000000.to.524288.000000=0
num.query.type.A=132061
num.query.type.PTR=4393
num.query.type.MX=1
num.query.type.TXT=1774
num.query.type.AAAA=20040
num.query.type.SRV=132
num.query.type.DS=3
num.query.type.DNSKEY=3
num.query.class.IN=158407
num.query.opcode.QUERY=158407
num.query.tcp=0
num.query.tcpout=157574
num.query.ipv6=0
num.query.flags.QR=0
num.query.flags.AA=0
num.query.flags.TC=0
num.query.flags.RD=158407
num.query.flags.RA=0
num.query.flags.Z=0
num.query.flags.AD=0
num.query.flags.CD=6
num.query.edns.present=1741
num.query.edns.DO=6
num.answer.rcode.NOERROR=154408
num.answer.rcode.FORMERR=0
num.answer.rcode.SERVFAIL=938
num.answer.rcode.NXDOMAIN=3060
num.answer.rcode.NOTIMPL=0
num.answer.rcode.REFUSED=0
num.answer.rcode.nodata=17092
num.query.ratelimited=0
num.answer.secure=0
num.answer.bogus=0
num.rrset.bogus=0
num.query.aggressive.NOERROR=0
num.query.aggressive.NXDOMAIN=0
unwanted.queries=0
unwanted.replies=0
msg.cache.count=6741
rrset.cache.count=7397
infra.cache.count=1
key.cache.count=0
num.query.authzone.up=0
num.query.authzone.down=0

root@link:~# uci export unbound
package unbound

config unbound
        option add_extra_dns '0'
        option add_local_fqdn '1'
        option add_wan_fqdn '0'
        option dhcp_link 'none'
        option dns64 '0'
        option domain_type 'static'
        option edns_size '1280'
        option hide_binddata '1'
        option localservice '1'
        option manual_conf '0'
        option protocol 'default'
        option rebind_localhost '0'
        option rebind_protection '1'
        option recursion 'default'
        option resource 'default'
        option root_age '9'
        option ttl_min '120'
        option validator '0'
        option extended_luci '1'
        option listen_port '1053'
        option extended_stats '1'
        option domain '<redacted>'
        option enabled '1'
        option unbound_control '1'
        list trigger_interface 'lan'
        list trigger_interface 'wan'

root@link:~#


FURTHER TESTING

Well, after some 8 hours of testing, I think we can conclude that, unless all SSL libraries tested have a memory leak, the culprit is unbound. Here's my results from the 3 processes handling the same queries for 8 hours:
Test started at 2018-09-12 03:23 GMT

/proc/28396/stat
28396 (unbound-libress) S 1 28395 28395 0 -1 4194624 881 0 0 0 14 4 0 0 20 0 1 0 91059687 11288576 1316 2147483647 4194304 4371120 2143577536 0 0 0 0 4096 16391 1 0 0 18 0 0 0 0 0 0 4440028 4451092 9265152 2143579619 2143579678 2143579678 2143580129 0
grep ^Vm /proc/28396/status
VmPeak:    11024 kB
VmSize:    11024 kB
VmLck:         0 kB
VmPin:         0 kB
VmHWM:      5264 kB
VmRSS:      5264 kB
VmData:     7076 kB
VmStk:       136 kB
VmExe:       176 kB
VmLib:      3632 kB
VmPTE:        24 kB
VmPMD:         0 kB
VmSwap:        0 kB
/proc/28393/stat
28393 (unbound-openssl) S 1 28392 28392 0 -1 4194624 868 0 0 0 14 8 0 0 20 0 1 0 91059479 14295040 1323 2147483647 4194304 4363632 2141325520 0 0 0 0 4096 16391 1 0 0 18 0 0 0 0 0 0 4431836 4443055 7643136 2141326744 2141326803 2141326803 2141327338 0
grep ^Vm /proc/28393/status
VmPeak:    13960 kB
VmSize:    13960 kB
VmLck:         0 kB
VmPin:         0 kB
VmHWM:      5292 kB
VmRSS:      5292 kB
VmData:     7032 kB
VmStk:       136 kB
VmExe:       168 kB
VmLib:      6620 kB
VmPTE:        28 kB
VmPMD:         0 kB
VmSwap:        0 kB
/proc/28390/stat
28390 (unbound) S 1 28389 28389 0 -1 4194624 877 0 0 0 16 1 0 0 20 0 1 0 91059265 11288576 1318 2147483647 4194304 4371120 2146959696 0 0 0 0 4096 16391 1 0 0 18 0 0 0 0 0 0 4440028 4451092 8785920 2146962912 2146962959 2146962959 2146963434 0
grep ^Vm /proc/28390/status
VmPeak:    11024 kB
VmSize:    11024 kB
VmLck:         0 kB
VmPin:         0 kB
VmHWM:      5272 kB
VmRSS:      5272 kB
VmData:     7076 kB
VmStk:       136 kB
VmExe:       176 kB
VmLib:      3632 kB
VmPTE:        24 kB
VmPMD:         0 kB
VmSwap:        0 kB

As of 2018-09-12 11:45 GMT

/proc/28396/stat
28396 (unbound-libress) S 1 28395 28395 0 -1 4194624 3537 0 0 0 50757 3411 0 0 20 0 1 0 91059687 20148224 3630 2147483647 4194304 4371120 2143577536 0 0 0 0 4096 16391 1 0 0 18 0 0 0 0 0 0 4440028 4451092 9265152 2143579619 2143579678 2143579678 2143580129 0
grep ^Vm /proc/28396/status
VmPeak:    19676 kB
VmSize:    19676 kB
VmLck:         0 kB
VmPin:         0 kB
VmHWM:     14520 kB
VmRSS:     14520 kB
VmData:    15728 kB
VmStk:       136 kB
VmExe:       176 kB
VmLib:      3632 kB
VmPTE:        32 kB
VmPMD:         0 kB
VmSwap:        0 kB
/proc/28393/stat
28393 (unbound-openssl) S 1 28392 28392 0 -1 4194624 3694 0 0 0 32473 2955 0 0 20 0 1 0 91059479 26906624 4586 2147483647 4194304 4363632 2141325520 0 0 0 0 4096 16391 1 0 0 18 0 0 0 0 0 0 4431836 4443055 7643136 2141326744 2141326803 2141326803 2141327338 0
grep ^Vm /proc/28393/status
VmPeak:    26276 kB
VmSize:    26276 kB
VmLck:         0 kB
VmPin:         0 kB
VmHWM:     18344 kB
VmRSS:     18344 kB
VmData:    19348 kB
VmStk:       136 kB
VmExe:       168 kB
VmLib:      6620 kB
VmPTE:        40 kB
VmPMD:         0 kB
VmSwap:        0 kB
/proc/28390/stat
28390 (unbound) S 1 28389 28389 0 -1 4194624 967 0 0 0 1987 2082 0 0 20 0 1 0 91059265 11599872 1402 2147483647 4194304 4371120 2146959696 0 0 0 0 4096 16391 1 0 0 18 0 0 0 0 0 0 4440028 4451092 8785920 2146962912 2146962959 2146962959 2146963434 0
grep ^Vm /proc/28390/status
VmPeak:    11328 kB
VmSize:    11328 kB
VmLck:         0 kB
VmPin:         0 kB
VmHWM:      5608 kB
VmRSS:      5608 kB
VmData:     7380 kB
VmStk:       136 kB
VmExe:       176 kB
VmLib:      3632 kB
VmPTE:        24 kB
VmPMD:         0 kB
VmSwap:        0 kB
Comment 1 jp 2018-09-21 12:31:59 CEST
Appears to be a memory leak?

https://github.com/openwrt/packages/issues/6957
Comment 2 Wouter Wijngaards 2018-09-21 13:14:13 CEST
Hi,

This is a duplicate of 4149 that is tracked here.  Your information is however very useful!  Please post comments on that bug (or here if you want that), I am noting down that it is a duplicate for bug tracking reasons, not because your report is not useful.

When I run unbound, with several versions of openssl, under valgrind, I see no memory leakage.  Nothing at all.  Your requestlist usage and the other bugreport report says that the requestlist growth is not the problem.  Unbound's caches also have not grown.  If there is some sort of leakage, it must be somewhere.

Can you run it with valgrind?  And see if it can report where the memory leaks originate from.  Please run first very briefly and see what that reports, that may just be missing cleanups or something (that may also depend on platform, but for me I compiled with --disable-gost and it removed some missing cleanup, and for 1.1.1 it was completely clean, for 1.0.2p it had some missing cleanup items).  Then after more queries, for me the cleanup items are the same, or with 1.1.1 everything is still clean perfectly.  But if you run it with valgrind (or another memory checker if that is what you want), and see if it has growing memory and can report where that happened?

I'll make a post on 4149 as well about this item.

Best regards, Wouter

*** This bug has been marked as a duplicate of bug 4149 ***
Comment 3 Eric Luehrsen 2018-09-21 13:56:05 CEST
Confirm duplicate.