• Categories
  • Recent
  • Tags
  • Popular
  • Users
  • Search
  • Register
  • Login
Netgate Discussion Forum
  • Categories
  • Recent
  • Tags
  • Popular
  • Users
  • Search
  • Register
  • Login

Unexpected random crash & reboot

TNSR
2
5
1.4k
Loading More Posts
  • Oldest to Newest
  • Newest to Oldest
  • Most Votes
Reply
  • Reply as topic
Log in to reply
This topic has been deleted. Only users with topic management privileges can see it.
  • N
    NBhatti
    last edited by NBhatti Mar 31, 2022, 9:59 AM Mar 31, 2022, 9:57 AM

    Hi, I am running TNSR Version: 22.02-1~tnsr-v22.02-1 with an Intel(R) Xeon(R) CPU E5-2450 0 @ 2.10GHz processor, 16GB of RAM, and Intel Corporation 82599ES 10-Gigabit SFI/SFP+ network cards. They are connected to a Mikrotik CCS with a 10Gtek SFP+ DAC Twinax cable. I have 2 full BGP feeds being tested, and the traffic load is around 150 Mbps. The server is a Dell R420 machine.

    My configuration is simple. The dataplane settings are:

    dataplane cpu workers 4
    dataplane ethernet default-mtu 1500
    dataplane dpdk dev 0000:08:00.0 network name dag10g
    dataplane dpdk dev default network devargs disable_source_pruning=1
    dataplane dpdk dev default network tso off
    dataplane dpdk dev default network vlan-strip-offload off
    dataplane dpdk uio-driver igb_uio
    dataplane buffers buffers-per-numa 32768
    dataplane ip6 heap-size 8G
    dataplane statseg heap-size 1G
    dataplane statseg per-node-counters enable
    dataplane linux-cp nl-rx-buffer-size 1073741824
    

    and complete configuration is as follows.

    TNSR-1 tnsr# show configuration running cli
    configuration history enable
    
    nacm disable
    nacm read-default deny
    nacm write-default deny
    nacm exec-default deny
    nacm group admin
        member naseer
        member root
        member tnsr
    exit
    nacm rule-list admin-rules
        group admin
        rule permit-all
            module *
            access-operations *
            action permit
        exit
    exit
    nacm enable
    
    sysctl vm nr_hugepages 2048
    
    system name TNSR-1
    
    restconf
        enable true
        global authentication-type client-certificate
        global server-certificate restconf
        global server-key restconf
        global server-ca-cert-path selfca
    exit
    
    dataplane cpu workers 4
    dataplane ethernet default-mtu 1500
    dataplane dpdk dev 0000:08:00.0 network name dag10g
    dataplane dpdk dev default network devargs disable_source_pruning=1
    dataplane dpdk dev default network tso off
    dataplane dpdk dev default network vlan-strip-offload off
    dataplane dpdk uio-driver igb_uio
    dataplane buffers buffers-per-numa 32768
    dataplane ip6 heap-size 8G
    dataplane statseg heap-size 1G
    dataplane statseg per-node-counters enable
    dataplane linux-cp nl-rx-buffer-size 1073741824
    
    acl WAN_PROTECTING_ACL
        rule 100
            action deny
            ip-version ipv4
            destination port 179 179
            protocol tcp
        exit
        rule 1000
            action permit
            ip-version ipv4
        exit
    exit
    
    
    nat global-options nat44 max-translations-per-thread 128000
    nat global-options nat44 enabled false
    
    prometheus host enable
    prometheus host filter /buffer-pools
    prometheus host filter /if
    prometheus host filter /interface
    prometheus host filter /mem
    prometheus host filter /node
    prometheus host filter /sys
    
    route table ipv4-VRF:0
        id 0
    exit
    
    
    interface subif dag10g 843
        exact-match
        outer-dot1q 843
    exit
    interface subif dag10g 999
        exact-match
        outer-dot1q 999
    exit
    interface subif dag10g 1992
        exact-match
        outer-dot1q 1992
    exit
    interface subif dag10g 2110
        exact-match
        outer-dot1q 2110
    exit
    
    interface dag10g
        description UPLINK_TO_MIKROTIK_P9
        enable
        detailed-stats enable
    exit
    interface dag10g.1992
        description MPPL_VLAN_1992
        enable
        ip address 10.156.1.190/30
        access-list input acl WAN_PROTECTING_ACL sequence 100
        detailed-stats enable
    exit
    interface dag10g.2110
        description MPPL_VLAN_2110
        enable
        ip address 10.192.1.122/30
        access-list input acl WAN_PROTECTING_ACL sequence 100
        detailed-stats enable
    exit
    interface dag10g.843
        description VTPL_VLAN_843
        enable
        ip address 192.168.23.1/30
    exit
    interface dag10g.999
        description WAN_VLAN_999
        enable
        ip address 10.99.99.10/24
        detailed-stats enable
    exit
    
    nat ipfix logging domain 1
    nat ipfix logging src-port 4739
    nat nat64 map parameters
        security-check enable
    exit
    
    unbound server
        enable ip4
        enable tcp
        enable udp
        enable harden glue
        enable hide identity
        port outgoing range 4096
    exit
    
    snmp host enable
    snmp dataplane disable
    snmp community community-name yLinx source 10.2.20.52/32 security-name TNSRMonitor
    snmp group group-name ROGroup security-name TNSRMonitor security-model v2c
    snmp view view-name systemview view-type included oid .1
    snmp access group-name ROGroup prefix exact model any level noauth read systemview write none
    

    The system reboots unexpectedly, and nothing shows up in the logs indicating a core dump or anything else before the crash.

    Mar 27 20:17:37 TNSR-1 kernel: [259758.847960] device eno1 entered promiscuous mode
    Mar 27 20:21:35 TNSR-1 kernel: [259997.299997] device eno1 left promiscuous mode
    Mar 28 08:08:52 TNSR-1 kernel: [302434.142391] audit: type=1400 audit(1648454932.547:36): apparmor="DENIED" operation="open" profile="/usr/sbin/ntpd" name="/snap/bin/" pid=121039 comm="ntpd" requested_mask="r" denied_mask="r" fsu>
    Mar 31 07:11:07 TNSR-1 kernel: [    0.000000] microcode: microcode updated early to revision 0x71a, date = 2020-03-24
    Mar 31 07:11:07 TNSR-1 kernel: [    0.000000] Linux version 5.11.0-27-generic (buildd@lcy01-amd64-019) (gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #29~20.04.1-Ubuntu SMP Wed Aug 11 15:58:17 U>
    Mar 31 07:11:07 TNSR-1 kernel: [    0.000000] Command line: BOOT_IMAGE=/vmlinuz-5.11.0-27-generic root=/dev/mapper/ubuntu--vg-ubuntu--lv ro intel_iommu=on iommu=pt console=ttyS0,115200n8 console=tty0 maybe-ubiquity
    

    Syslog also shows

    Mar 31 07:06:22 TNSR-1 vpp_prometheus_export[105457]: Client address is [::ffff:10.2.20.62]:43212
    Mar 31 07:06:37 TNSR-1 vpp_prometheus_export[105457]: Client address is [::ffff:10.2.20.62]:43216
    Mar 31 07:11:07 TNSR-1 systemd-modules-load[705]: Inserted module 'uio'
    Mar 31 07:11:07 TNSR-1 systemd-modules-load[705]: Inserted module 'igb_uio'
    Mar 31 07:11:07 TNSR-1 systemd-modules-load[705]: Inserted module 'msr'
    Mar 31 07:11:07 TNSR-1 lvm[699]:   1 logical volume(s) in volume group "ubuntu-vg" monitored
    Mar 31 07:11:07 TNSR-1 systemd-modules-load[705]: Module 'vfio_pci' is built in
    Mar 31 07:11:07 TNSR-1 systemd[1]: Mounted Huge Pages File System.
    Mar 31 07:11:07 TNSR-1 kernel: [    0.000000] microcode: microcode updated early to revision 0x71a, date = 2020-03-24
    Mar 31 07:11:07 TNSR-1 systemd[1]: Mounted POSIX Message Queue File System.
    

    I had the same behavior when I had the following settings

    dataplane dpdk dev default network tso on
    dataplane dpdk dev default network vlan-strip-offload on
    

    The dataplane would crash randomly, so I set them to off. But I am again seeing the same unexpected reboots. Any ideas where to look, and what could be the reason for these reboots?

    Thanks.

    D 1 Reply Last reply Apr 1, 2022, 12:40 PM Reply Quote 0
    • D
      Derelict LAYER 8 Netgate @NBhatti
      last edited by Apr 1, 2022, 12:40 PM

      @NBhatti

      What are you running it on?

      Why do you need 4 workers? I would set that to 1 and see if it stabilizes.

      Chattanooga, Tennessee, USA
      A comprehensive network diagram is worth 10,000 words and 15 conference calls.
      DO NOT set a source address/port in a port forward or firewall rule unless you KNOW you need it!
      Do Not Chat For Help! NO_WAN_EGRESS(TM)

      N 1 Reply Last reply Apr 1, 2022, 9:19 PM Reply Quote 0
      • N
        NBhatti @Derelict
        last edited by Apr 1, 2022, 9:19 PM

        @derelict said in Unexpected random crash & reboot:

        @NBhatti

        What are you running it on?

        Why do you need 4 workers? I would set that to 1 and see if it stabilizes.

        It's a Dell R420 server machine with Ubuntu. There is no specific reason for 4 workers; it was just to maximize performance, since I want to test it with 2 x 10G uplinks. It could be that the 100% CPU core usage contributes to overheating, hence the reboot, but I can't seem to find anything in the reboot logs or the iDRAC System Event Log either.

        D 1 Reply Last reply Apr 3, 2022, 3:18 PM Reply Quote 0
        • D
          Derelict LAYER 8 Netgate @NBhatti
          last edited by Apr 3, 2022, 3:18 PM

          @nbhatti What do you mean "Dell R420 server machine with Ubuntu"?

          The ubuntu that is installed by the tnsr installer or something else?

          I would absolutely go down to 1 worker and see if it stabilizes. You do not need 4 workers to saturate 2 x 10G links. 1 should be sufficient.

          Chattanooga, Tennessee, USA
          A comprehensive network diagram is worth 10,000 words and 15 conference calls.
          DO NOT set a source address/port in a port forward or firewall rule unless you KNOW you need it!
          Do Not Chat For Help! NO_WAN_EGRESS(TM)

          N 1 Reply Last reply Apr 3, 2022, 4:34 PM Reply Quote 0
          • N
            NBhatti @Derelict
            last edited by Apr 3, 2022, 4:34 PM

            @derelict When you said what are you running it on, I thought you were asking for the hardware :)

            Anyway, I have turned the config down to just 1 worker. 4 workers had been working fine since the last reboot, but since a single core should also serve the purpose, I'll set it to 1 worker to see if that works fine.

            Thanks for the help.

            1 Reply Last reply Reply Quote 0
            4 out of 5
            • First post
              4/5
              Last post
            Copyright 2025 Rubicon Communications LLC (Netgate). All rights reserved.