Crash Report - Fatal trap 12: page fault while in kernel mode (lsof)

Be-Bop-Bo

I have been experiencing random crashes on at least two separate systems; they have the same hardware setup. I have done a little looking around, but I will be honest in saying I am not a FreeBSD expert... I have been putting this off for a bit but really need to get to the bottom of it. Any help is super appreciated.

TopTon
Intel(R) N100
Current: 2015 MHz, Max: 806 MHz
4 CPUs: 1 package(s) x 4 core(s)
AES-NI CPU Crypto: Yes (active)
QAT Crypto: No
2.7.2-RELEASE (amd64)
4x Intel i226-v

Avg Temp ~100F
RAM - ~50% of 8GB
HDD - 1% used of 180G (zfs)

Device 01:
Fatal trap 12: page fault while in kernel mode
cpuid = 2; apic id = 04
fault virtual address = 0x40
fault code = supervisor read data, page not present
instruction pointer = 0x20:0xffffffff80ca9140
stack pointer = 0x28:0xfffffe00b16fc730
frame pointer = 0x28:0xfffffe00b16fc730
code segment = base 0x0, limit 0xfffff, type 0x1b
= DPL 0, pres 1, long 1, def32 0, gran 1
processor eflags = interrupt enabled, resume, IOPL = 0
current process = 49398 (lsof)
rdi: fffff801f3150e00 rsi: fffff801b00beb00 rdx: 0000000000000000
rcx: ffffffff82d62a40 r8: 0000000000000002 r9: ffffffffffffffff
rax: 0000000000000000 rbx: fffffe00b16fcbe0 rbp: fffffe00b16fc730
r10: 0000000000000000 r11: fffffe00b413e1a0 r12: fffffe00b16fc7d8
r13: fffffe00b413dc80 r14: fffff801f3150e00 r15: fffff801f77a1540
trap number = 12
panic: page fault
cpuid = 2
time = 1742174470
KDB: enter: panic

db:0:kdb.enter.default>  show pcpu
cpuid        = 2
dynamic pcpu = 0xfffffe008ef67f80
curthread    = 0xfffffe00b413dc80: pid 49398 tid 100440 critnest 1 "lsof"
curpcb       = 0xfffffe00b413e1a0
fpcurthread  = 0xfffffe00b413dc80: pid 49398 "lsof"
idlethread   = 0xfffffe0011ee5560: tid 100005 "idle: cpu2"
self         = 0xffffffff84012000
curpmap      = 0xfffff800065afd38
tssp         = 0xffffffff84012384
rsp0         = 0xfffffe00b16fd000
kcr3         = 0xffffffffffffffff
ucr3         = 0xffffffffffffffff
scr3         = 0x0
gs32p        = 0xffffffff84012404
ldt          = 0xffffffff84012444
tss          = 0xffffffff84012434
curvnet      = 0xfffff800012004c0
db:0:kdb.enter.default>  bt
Tracing pid 49398 tid 100440 td 0xfffffe00b413dc80
kdb_enter() at kdb_enter+0x32/frame 0xfffffe00b16fc410
vpanic() at vpanic+0x163/frame 0xfffffe00b16fc540
panic() at panic+0x43/frame 0xfffffe00b16fc5a0
trap_fatal() at trap_fatal+0x40c/frame 0xfffffe00b16fc600
trap_pfault() at trap_pfault+0x4f/frame 0xfffffe00b16fc660
calltrap() at calltrap+0x8/frame 0xfffffe00b16fc660
--- trap 0xc, rip = 0xffffffff80ca9140, rsp = 0xfffffe00b16fc730, rbp = 0xfffffe00b16fc730 ---
prison_check() at prison_check+0x20/frame 0xfffffe00b16fc730
cr_canseeinpcb() at cr_canseeinpcb+0x19/frame 0xfffffe00b16fc760
tcp_pcblist() at tcp_pcblist+0x1f6/frame 0xfffffe00b16fcaf0
sysctl_root_handler_locked() at sysctl_root_handler_locked+0x90/frame 0xfffffe00b16fcb40
sysctl_root() at sysctl_root+0x216/frame 0xfffffe00b16fcbc0
userland_sysctl() at userland_sysctl+0x176/frame 0xfffffe00b16fcc70
kern___sysctlbyname() at kern___sysctlbyname+0x21d/frame 0xfffffe00b16fcdc0
sys___sysctlbyname() at sys___sysctlbyname+0x2d/frame 0xfffffe00b16fce00
amd64_syscall() at amd64_syscall+0x109/frame 0xfffffe00b16fcf30
fast_syscall_common() at fast_syscall_common+0xf8/frame 0xfffffe00b16fcf30
--- syscall (570, FreeBSD ELF64, __sysctlbyname), rip = 0x8222c326a, rsp = 0x82092cb88, rbp = 0x82092cbc0 ---
db:0:kdb.enter.default>  ps
  pid  ppid  pgrp   uid  state   wmesg   wchan               cmd
54279     1 39114     0  R                                   ping
53801 39669 39114     0  R       CPU 1                       telegraf
53600 39669 39114     0  S       select  0xfffff80120681540  ping
53581 39669 39114     0  R                                   telegraf
53308 39669 39114     0  D       sysctl  0xffffffff82c02d00  ps
52602 39669 39114     0  D       sysctl  0xffffffff82c02d00  lsof
52268 39669 39114     0  R                                   telegraf
52103 39669 39114     0  R       CPU 3                       pgrep
51807 39669 39114     0  D       sysctl  0xffffffff82c02d00  lsof
49398 39669 39114     0  R       CPU 2                       lsof
39669 39114 39114     0  R       (threaded)                  telegraf

Device 02:
Fatal trap 12: page fault while in kernel mode
cpuid = 2; apic id = 04
fault virtual address = 0x40
fault code = supervisor read data, page not present
instruction pointer = 0x20:0xffffffff80ca9140
stack pointer = 0x28:0xfffffe01057eb730
frame pointer = 0x28:0xfffffe01057eb730
code segment = base 0x0, limit 0xfffff, type 0x1b
= DPL 0, pres 1, long 1, def32 0, gran 1
processor eflags = interrupt enabled, resume, IOPL = 0
current process = 43259 (lsof)
rdi: fffff8021dd69700 rsi: fffff80193948400 rdx: 0000000000000000
rcx: ffffffff82d62a40 r8: 0000000000000002 r9: ffffffffffffffff
rax: 0000000000000000 rbx: fffffe01057ebbe0 rbp: fffffe01057eb730
r10: 00000a1944f62158 r11: fffffe011c81ec60 r12: fffffe01057eb7d8
r13: fffffe011c81e740 r14: fffff8021dd69700 r15: fffff802b52f3000
trap number = 12
panic: page fault
cpuid = 2
time = 1741776490
KDB: enter: panic

db:0:kdb.enter.default>  show pcpu
cpuid        = 2
dynamic pcpu = 0xfffffe009d500f80
curthread    = 0xfffffe011c81e740: pid 43259 tid 102977 critnest 1 "lsof"
curpcb       = 0xfffffe011c81ec60
fpcurthread  = 0xfffffe011c81e740: pid 43259 "lsof"
idlethread   = 0xfffffe0020490560: tid 100005 "idle: cpu2"
self         = 0xffffffff84012000
curpmap      = 0xfffff80447150398
tssp         = 0xffffffff84012384
rsp0         = 0xfffffe01057ec000
kcr3         = 0xffffffffffffffff
ucr3         = 0xffffffffffffffff
scr3         = 0x0
gs32p        = 0xffffffff84012404
ldt          = 0xffffffff84012444
tss          = 0xffffffff84012434
curvnet      = 0xfffff80001240480
db:0:kdb.enter.default>  bt
Tracing pid 43259 tid 102977 td 0xfffffe011c81e740
kdb_enter() at kdb_enter+0x32/frame 0xfffffe01057eb410
vpanic() at vpanic+0x163/frame 0xfffffe01057eb540
panic() at panic+0x43/frame 0xfffffe01057eb5a0
trap_fatal() at trap_fatal+0x40c/frame 0xfffffe01057eb600
trap_pfault() at trap_pfault+0x4f/frame 0xfffffe01057eb660
calltrap() at calltrap+0x8/frame 0xfffffe01057eb660
--- trap 0xc, rip = 0xffffffff80ca9140, rsp = 0xfffffe01057eb730, rbp = 0xfffffe01057eb730 ---
prison_check() at prison_check+0x20/frame 0xfffffe01057eb730
cr_canseeinpcb() at cr_canseeinpcb+0x19/frame 0xfffffe01057eb760
tcp_pcblist() at tcp_pcblist+0x1f6/frame 0xfffffe01057ebaf0
sysctl_root_handler_locked() at sysctl_root_handler_locked+0x90/frame 0xfffffe01057ebb40
sysctl_root() at sysctl_root+0x216/frame 0xfffffe01057ebbc0
userland_sysctl() at userland_sysctl+0x176/frame 0xfffffe01057ebc70
kern___sysctlbyname() at kern___sysctlbyname+0x21d/frame 0xfffffe01057ebdc0
sys___sysctlbyname() at sys___sysctlbyname+0x2d/frame 0xfffffe01057ebe00
amd64_syscall() at amd64_syscall+0x109/frame 0xfffffe01057ebf30
fast_syscall_common() at fast_syscall_common+0xf8/frame 0xfffffe01057ebf30
--- syscall (570, FreeBSD ELF64, __sysctlbyname), rip = 0x8239fd26a, rsp = 0x8208318e8, rbp = 0x820831920 ---
db:0:kdb.enter.default>  ps
  pid  ppid  pgrp   uid  state   wmesg   wchan               cmd
43867 86475 86226     0  R                                   telegraf
43692 86475 86226     0  R       CPU -1                      ping
43520 86475 86226     0  R                                   telegraf
43405 86475 86226     0  D       sysctl  0xffffffff82c02d00  lsof
43261 86475 86226     0  R                                   lsof
43259 86475 86226     0  R       CPU 2                       lsof
41994 31844   399     0  S       nanslp  0xffffffff83063d63  sleep
31844 98056   399     0  S       wait    0xfffffe002048e5c0  sh
 1431 64500   399     0  S       nanslp  0xffffffff83063d61  sleep
86475 86226 86226     0  R       (threaded)                  telegraf

stephenw10

Hmm, not a crash I've seen before. Are you running lsof manually to trigger it?

What packages do you have installed? lsof is not included in 2.7.2 by default.

Be-Bop-Bo

@stephenw10
I am not running it manually, so honestly I am not really sure what is using it. I do have a couple of manual scripts running to populate some Grafana dashboards that collect a fair amount of stats, but I do not remember installing lsof for its use. Reviewing the bash scripts, I do not see it listed.

Running ps -ax does not show lsof running as of now.

Packages:
Acme
apcupsd
iperf
nmap
ntopng
pfblockerng
service_watchdog
suricata
telegraf
wireguard

stephenw10

Hmm, it appears lsof is a dependency of Telegraf: https://github.com/pfsense/FreeBSD-ports/blob/devel/net-mgmt/pfSense-pkg-Telegraf/Makefile#L17

How do you have it configured?

Be-Bop-Bo

@stephenw10
Sure, here is what I have. I will note that at the bottom there is a listed GitHub project, and files that should have been called; however, I did not replace them when I migrated to the Topton mini-PC from my old 1U Atom that I had been using for the last 8+ years.

But from what you have shown it should be localized to the Telegraf package, so that really helps.

[[inputs.net]]
  interfaces = ["igc0", "igc1", "igc2", "igc3","tun_wg0", "tun_wg1", "tun_wg2", "tun_wg3"]
[[inputs.conntrack]]
[[inputs.filestat]]
[[inputs.internal]]
[[inputs.interrupts]]
[[inputs.linux_sysctl_fs]]
[[inputs.net]]
[[inputs.net_response]]
  protocol = "tcp"
  address = "localhost:443"
[[inputs.netstat]]
[[inputs.nstat]]
[[inputs.procstat]]
  pattern = "."
  prefix = "pgrep_serviceprocess"

 [[inputs.dns_query]]
#   ## servers to query
#   servers = ["8.8.8.8"]
     servers = ["208.67.222.222"]


[[inputs.netstat]]
#   # no configuration

# Read metrics about swap memory usage
[[inputs.swap]]
  # no configuration

[[inputs.ping]]
#   ## Hosts to send ping packets to.
     urls = ["208.67.222.222"]
#
#   ## Method used for sending pings, can be either "exec" or "native".  When set
#   ## to "exec" the systems ping command will be executed.  When set to "native"
#   ## the plugin will send pings directly.
#   ##
#   ## While the default is "exec" for backwards compatibility, new deployments
#   ## are encouraged to use the "native" method for improved compatibility and
#   ## performance.
#   # method = "exec"
#
#   ## Number of ping packets to send per interval.  Corresponds to the "-c"
#   ## option of the ping command.
#   # count = 1
#
#   ## Time to wait between sending ping packets in seconds.  Operates like the
#   ## "-i" option of the ping command.
#   # ping_interval = 1.0
#
#   ## If set, the time to wait for a ping response in seconds.  Operates like
#   ## the "-W" option of the ping command.
#   # timeout = 1.0
#
#   ## If set, the total ping deadline, in seconds.  Operates like the -w option
#   ## of the ping command.
#   # deadline = 10
#
#   ## Interface or source address to send ping from.  Operates like the -I or -S
#   ## option of the ping command.
#   # interface = ""
#
#   ## Specify the ping executable binary.
#   # binary = "ping"
#
#   ## Arguments for ping command. When arguments is not empty, the command from
#   ## the binary option will be used and other options (ping_interval, timeout,
#   ## etc) will be ignored.
#   # arguments = ["-c", "3"]
#
#   ## Use only IPv6 addresses when resolving a hostname.
#   # ipv6 = false

####################
## GIT: https://github.com/VictorRobellini/pfSense-Dashboard
[[inputs.exec]]
   commands = [
     "/usr/local/bin/telegraf_pfinterface.php",
     "/usr/local/bin/telegraf_gateways.py",
      "/usr/local/bin/telegraf_pfifgw.php",
      "sh /usr/local/bin/telegraf_temperature.sh",
      "sh /usr/local/bin/telegraf_pinger_loss.sh"
   ]
   data_format = "influx"

[[inputs.logparser]]
  files = ["/var/log/pfblockerng/dnsbl.log"]
  from_beginning=true
  [inputs.logparser.grok]
    measurement = "dnsbl_log"
    patterns = ["^%{WORD:BlockType}-%{WORD:BlockSubType},%{SYSLOGTIMESTAMP:timestamp:ts-syslog},%{IPORHOST:destination:tag},%{IPORHOST:source:tag},%{GREEDYDATA:call},%{WORD:BlockMethod},%{WORD:BlockList},%{IPORHOST:tld:tag},%{WORD:DefinedList:tag},%{GREEDYDATA:hitormiss}"]
    timezone = "Local"
    [inputs.logparser.tags]
      value = "1"

[[inputs.logparser]]
    files = ["/var/log/pfblockerng/ip_block.log"]
    from_beginning=true
    [inputs.logparser.grok]
        measurement = "ip_block_log"
        patterns = ["^%{SYSLOGTIMESTAMP:timestamp:ts-syslog},%{NUMBER:TrackerID},%{GREEDYDATA:Interface},%{WORD:InterfaceName},%{WORD:action},%{NUMBER:IPVersion},%{NUMBER:ProtocolID},%{GREEDYDATA:Protocol},%{IPORHOST:SrcIP:tag},%{IPORHOST:DstIP:tag},%{NUMBER:SrcPort},%{NUMBER:DstPort},%{WORD:Dir},%{WORD:GeoIP:tag},%{GREEDYDATA:AliasName},%{GREEDYDATA:IPEvaluated},%{GREEDYDATA:FeedName:tag},%{HOSTNAME:ResolvedHostname},%{HOSTNAME:ClientHostname},%{GREEDYDATA:ASN},%{GREEDYDATA:DuplicateEventStatus}"]
        timezone = "Local"

[[inputs.unbound]]
  server = "127.0.0.1:953"
  binary = "/usr/local/bin/telegraf_unbound.sh"

stephenw10

Which of those are custom scripts you've imported?

Can you see how lsof is being called? Or disable that as a test?

Be-Bop-Bo

@stephenw10
This is the part of the config that called the custom scripts. But as they are not currently present, I have removed this part of the config. Looking at the scripts, I do not see any lsof reference.

The second device had the same config, and after I removed it the box rebooted for some reason. Now I cannot get to it, because it sometimes does not want to bring up the WAN correctly. Once it is back online, I will have to see if there was a crash report or not.

####################
## GIT: https://github.com/VictorRobellini/pfSense-Dashboard
[[inputs.exec]]
   commands = [
     "/usr/local/bin/telegraf_pfinterface.php",
     "/usr/local/bin/telegraf_gateways.py",
      "/usr/local/bin/telegraf_pfifgw.php",
      "sh /usr/local/bin/telegraf_temperature.sh",
      "sh /usr/local/bin/telegraf_pinger_loss.sh"

stephenw10

Mmm, it seems like it must be the inputs.filestat call. What does that actually report? Can you comment it out to test?

Be-Bop-Bo

@stephenw10 - Roger that, I really do appreciate the help. I see no reason to have that in the config as I am not using it. It is now commented out. I will have to take a better look at the others to confirm which ones I am actually using.

One more question, if I could: After these crashes I usually see push notifications of the reboot and Pushover web API notifications. So it has internet access for a while, then the device goes unreachable with this type of error.

arpresolve: can't allocate llinfo for x.x.x.x (WAN IP GW) on igc0

I have seen other posts about this, but I do not think I found a good resolution for the issue. I suppose the real fix is to have them stop crashing, but... Yeah, just thought I would ask.

Be-Bop-Bo

@stephenw10 OK, I kept looking at these as I did have another crash, but this time in the clock process. Looking down my list, I am seeing another plugin using lsof:
[[inputs.netstat]]

https://github.com/influxdata/telegraf/tree/master/plugins/inputs/netstat

I will keep looking and see whether I actually use these specific network collections. Network is my specific use case, so I will just have to try.

stephenw10

Those arpresolve errors are usually nothing to worry about. It's trying to create an arp entry for the gateway but no longer has an interface in that subnet because it lost the WAN. As soon as the WAN comes back up it clears. You should only ever see it temporarily when that happens.

Be-Bop-Bo

@stephenw10
I had another instance of a crash and reboot. It always seems to happen when my modem reboots, or maybe just when there are changes in the state/connectivity of the WAN interface? Should I ask on Telegraf's forum?

I would post more, but I am getting flagged as spam?

Be-Bop-Bo

@Be-Bop-Bo

Fatal trap 9: general protection fault while in kernel mode
cpuid = 0; apic id = 00
instruction pointer	= 0x20:0xffffffff80d4caa4
stack pointer	        = 0x28:0xfffffe0084131c00
frame pointer	        = 0x28:0xfffffe0084131c40
code segment		= base 0x0, limit 0xfffff, type 0x1b
			= DPL 0, pres 1, long 1, def32 0, gran 1
processor eflags	= resume, IOPL = 0
current process		= 2 (clock (0))

db:0:kdb.enter.default>  show pcpu
cpuid        = 0
dynamic pcpu = 0x111bf80
curthread    = 0xfffffe0011faa560: pid 2 tid 100041 critnest 1 "clock (0)"
curpcb       = 0xfffffe0011faaa80
fpcurthread  = none
idlethread   = 0xfffffe0011ee63a0: tid 100003 "idle: cpu0"
self         = 0xffffffff84010000
curpmap      = 0xffffffff83020ab0
tssp         = 0xffffffff84010384
rsp0         = 0xfffffe0084132000
kcr3         = 0xffffffffffffffff
ucr3         = 0xffffffffffffffff
scr3         = 0x0
gs32p        = 0xffffffff84010404
ldt          = 0xffffffff84010444
tss          = 0xffffffff84010434
curvnet      = 0xfffff800012004c0
db:0:kdb.enter.default>  bt
Tracing pid 2 tid 100041 td 0xfffffe0011faa560
kdb_enter() at kdb_enter+0x32/frame 0xfffffe0084131940
vpanic() at vpanic+0x163/frame 0xfffffe0084131a70
panic() at panic+0x43/frame 0xfffffe0084131ad0
trap_fatal() at trap_fatal+0x40c/frame 0xfffffe0084131b30
calltrap() at calltrap+0x8/frame 0xfffffe0084131b30
--- trap 0x9, rip = 0xffffffff80d4caa4, rsp = 0xfffffe0084131c00, rbp = 0xfffffe0084131c40 ---
turnstile_wait() at turnstile_wait+0x134/frame 0xfffffe0084131c40
__mtx_lock_sleep() at __mtx_lock_sleep+0x171/frame 0xfffffe0084131cd0
crfree() at crfree+0xaf/frame 0xfffffe0084131cf0
in_pcbfree() at in_pcbfree+0x280/frame 0xfffffe0084131d20
sorele_locked() at sorele_locked+0x89/frame 0xfffffe0084131d40
tcp_close() at tcp_close+0x159/frame 0xfffffe0084131d80
tcp_timer_2msl() at tcp_timer_2msl+0xf9/frame 0xfffffe0084131dd0
tcp_timer_enter() at tcp_timer_enter+0x101/frame 0xfffffe0084131e10
softclock_call_cc() at softclock_call_cc+0x134/frame 0xfffffe0084131ec0
softclock_thread() at softclock_thread+0xe9/frame 0xfffffe0084131ef0
fork_exit() at fork_exit+0x7f/frame 0xfffffe0084131f30
fork_trampoline() at fork_trampoline+0xe/frame 0xfffffe0084131f30
--- trap 0xa42be40b, rip = 0x8ba58ba52e552c55, rsp = 0xb48fb48f3dac3dac, rbp = 0x2bce2bca8e5e8e7e ---

stephenw10

Upvoted a bunch of your posts, you should be good to avoid the spam filters now.

That looks like a completely different crash though. What, if anything, has changed since the last one?

I've seen that one time before and it seemed to be openvpn related.

Be-Bop-Bo

@stephenw10
The change is the telegraf config file. I thought I saw some more stability in the package. When I changed it over on 3 other devices, on some of them it caused that crash. I have had OpenVPN in the past, so it might linger in my config, but it is not currently installed as I moved over to WG exclusively.

stephenw10

Hmm, we might need to wait for another crash and see if it's identical. The only previous time we've seen this it was a one-time incident and we never found a cause.