-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 We're running SLES 8 on a Tyan 2800 BIOS 2.01l with 2x Opteron 244, 6GB RAM. Linux db64 2.4.19-SMP #1 SMP Mon Mar 31 23:48:08 UTC 2003 x86_64 unknown Last time I tried to upgrade the kernel the new kernel didn't like something about the zero channel raid and wouldn't mount the root disk. :(, same with the SLES Service pack 2, so we started from scratch install, no extra patches. we've got a couple of issues... 1) Linux won't boot with the DRAM ECC enabled in the BIOS.. so we're running w/o ECC. 2) I ran a "memory eater" script, that ate up all the memory, the machine didn't swap.. it crashed... spit out some errors... below is the script, top output right after the crash, and the error messages. Any ideas? bad ram? Kris. - --- snip --- export PARALLEL=1 #!/bin/bash2 # # memtest.sh # # Shell script to help isolate memory failures under linux # # Author: Doug Ledford + contributors # # (C) Copyright 2000-2002 Doug Ledford; Red Hat, Inc. # This shell script is released under the terms of the GNU General # Public License Version 2, June 1991. If you do not have a copy # of the GNU General Public License Version 2, then one may be # retrieved from http://people.redhat.com/dledford/GPL.html # # Note, this needs bash2 for the wait command support. # This is where we will run the tests at if [ -z "$TEST_DIR" ]; then ~ TEST_DIR=/usr/zap/tmp fi # The location of the linux kernel source file we will be using if [ -z "$SOURCE_FILE" ]; then ~ SOURCE_FILE=$TEST_DIR/linux.tar.gz fi if [ ! -f "$SOURCE_FILE" ]; then ~ echo "Missing source file $SOURCE_FILE" ~ exit 1 fi # How many passes to run of this test, higher numbers are better if [ -z "$NR_PASSES" ]; then ~ NR_PASSES=40 fi # Guess how many megs the unpacked archive is if [ -z "$MEG_PER_COPY" ]; then ~ MEG_PER_COPY=$(ls -l $SOURCE_FILE | awk '{print int($5/1024/1024) * 4}') fi # How many trees do we have to unpack in order to make our trees be larger # than physical RAM? If we don't unpack more data than memory can hold # before we start to run the diff program on the trees then we won't # actually flush the data to disk and force the system to reread the data # from disk. Instead, the system will do everything in RAM. That doesn't # work (as far as the memory test is concerned). It's the simultaneous # unpacking of data in memory and the read/writes to hard disk via DMA that # breaks the memory subsystem in most cases. Doing everything in RAM without # causing disk I/O will pass bad memory far more often than when you add # in the disk I/O. if [ -z "$NR_SIMULTANEOUS" ]; then ~ NR_SIMULTANEOUS=$(free | awk -v meg_per_copy=$MEG_PER_COPY 'NR == 2 {print int($2*1.5/1024/meg_per_ copy + (($2/1024)%meg_per_copy >= (meg_per_copy/2)) + (($2/1024/32) < 1))}') fi # Should we unpack/diff the $NR_SIMULTANEOUS trees in series or in parallel? if [ ! -z "$PARALLEL" ]; then ~ PARALLEL="yes" else ~ PARALLEL="no" fi if [ ! -z "$JUST_INFO" ]; then ~ echo "TEST_DIR: $TEST_DIR" ~ echo "SOURCE_FILE: $SOURCE_FILE" ~ echo "NR_PASSES: $NR_PASSES" ~ echo "MEG_PER_COPY: $MEG_PER_COPY" ~ echo "NR_SIMULTANEOUS: $NR_SIMULTANEOUS" ~ echo "PARALLEL: $PARALLEL" ~ echo ~ exit fi cd $TEST_DIR # Remove any possible left over directories from a cancelled previous run rm -fr linux linux.orig linux.pass.* # Unpack the one copy of the source tree that we will be comparing against tar -xzf $SOURCE_FILE mv linux linux.orig i=0 while [ "$i" -lt "$NR_PASSES" ]; do ~ j=0 ~ while [ "$j" -lt "$NR_SIMULTANEOUS" ]; do ~ if [ $PARALLEL = "yes" ]; then ~ (mkdir $j; tar -xzf $SOURCE_FILE -C $j; mv $j/linux linux.pass.$j; rmdir $j) & ~ else ~ tar -xzf $SOURCE_FILE ~ mv linux linux.pass.$j ~ fi ~ j=`expr $j + 1` ~ done ~ wait ~ j=0 ~ while [ "$j" -lt "$NR_SIMULTANEOUS" ]; do ~ if [ $PARALLEL = "yes" ]; then ~ (mkdir $j; tar -xzf $SOURCE_FILE -C $j; mv $j/linux linux.pass.$j; rmdir $j) & ~ else ~ tar -xzf $SOURCE_FILE ~ mv linux linux.pass.$j ~ fi ~ j=`expr $j + 1` ~ done ~ wait ~ j=0 ~ while [ "$j" -lt "$NR_SIMULTANEOUS" ]; do ~ if [ $PARALLEL = "yes" ]; then ~ (diff -U 3 -rN linux.orig linux.pass.$j; rm -fr linux.pass.$j) & ~ else ~ diff -U 3 -rN linux.orig linux.pass.$j ~ rm -fr linux.pass.$j ~ fi ~ j=`expr $j + 1` ~ done ~ wait ~ i=`expr $i + 1` done # Clean up after ourselves rm -fr linux linux.orig linux.pass.* ~ 8:50pm up 3:06, 2 users, load average: 54.12, 51.66, 33.69 207 processes: 206 sleeping, 1 running, 0 zombie, 0 stopped CPU0 states: 0.1% user, 0.0% system, 0.0% nice, 99.4% idle CPU1 states: 0.0% user, 0.0% system, 0.0% nice, 100.0% idle Mem: 5806792K av, 5791816K used, 14976K free, 0K shrd, 437508K buff Swap: 1052248K av, 0K used, 1052248K free 4640676K cached ~ PID USER PRI NI SIZE RSS SHARE STAT %CPU %MEM TIME COMMAND ~ 1738 zappos 15 0 1260 1260 844 R 0.1 0.0 0:05 top ~ 1 root 15 0 240 240 188 S 0.0 0.0 0:04 init ~ 2 root 0K 0 0 0 0 SW 0.0 0.0 0:00 migration_CPU0 ~ 3 root 0K 0 0 0 0 SW 0.0 0.0 0:00 migration_CPU1 ~ 4 root 15 0 0 0 0 SW 0.0 0.0 0:00 keventd ~ 5 root 34 19 0 0 0 SWN 0.0 0.0 0:00 ksoftirqd_CPU0 ~ 6 root 34 19 0 0 0 SWN 0.0 0.0 0:00 ksoftirqd_CPU1 ~ 7 root 15 0 0 0 0 SW 0.0 0.0 0:00 kswapd ~ 8 root 25 0 0 0 0 SW 0.0 0.0 0:00 bdflush ~ 9 root 15 0 0 0 0 SW 0.0 0.0 0:00 kupdated ~ 10 root 25 0 0 0 0 SW 0.0 0.0 0:00 kinoded ~ 12 root 25 0 0 0 0 SW 0.0 0.0 0:00 mdrecoveryd ~ 16 root 15 0 0 0 0 SW 0.0 0.0 0:00 kreiserfsd ~ 73 root 0 -20 0 0 0 SW< 0.0 0.0 0:00 lvm-mpd ~ 420 root 15 0 660 660 524 S 0.0 0.0 0:00 syslogd ~ 423 root 15 0 1376 1376 444 S 0.0 0.0 0:00 klogd ~ 459 root 24 0 0 0 0 SW 0.0 0.0 0:00 khubd ~ 612 bin 25 0 460 460 360 S 0.0 0.0 0:00 portmap ~ 634 root 23 0 2860 2860 1488 S 0.0 0.0 0:00 snmpd ~ 678 root 15 0 1904 1904 1736 S 0.0 0.0 0:00 sshd ~ 876 root 15 0 1828 1828 1388 S 0.0 0.0 0:00 master ~ 885 postfix 15 0 2020 2020 1532 S 0.0 0.0 0:00 qmgr ~ 900 at 16 0 608 608 488 S 0.0 0.0 0:00 atd ~ 915 root 15 0 696 696 552 S 0.0 0.0 0:00 cron ~ 1012 root 15 0 800 800 620 S 0.0 0.0 0:00 nscd ~ 1013 root 15 0 800 800 620 S 0.0 0.0 0:00 nscd ~ 1014 root 15 0 800 800 620 S 0.0 0.0 0:00 nscd ~ 1015 root 15 0 800 800 620 S 0.0 0.0 0:00 nscd ~ 1016 root 15 0 800 800 620 S 0.0 0.0 0:00 nscd ~ 1017 root 15 0 800 800 620 S 0.0 0.0 0:00 nscd ~ 1018 root 15 0 800 800 620 S 0.0 0.0 0:00 nscd ~ 1031 root 20 0 512 512 428 S 0.0 0.0 0:00 mingetty ~ 1032 root 20 0 512 512 428 S 0.0 0.0 0:00 mingetty ~ 1033 root 21 0 512 512 428 S 0.0 0.0 0:00 mingetty ~ 1034 root 20 0 512 512 428 S 0.0 0.0 0:00 mingetty ~ 1035 root 21 0 512 512 428 S 0.0 0.0 0:00 mingetty ~ 1036 root 21 0 512 512 428 S 0.0 0.0 0:00 mingetty ~ 1320 postfix 15 0 1724 1724 1312 S 0.0 0.0 0:00 pickup ~ 1366 root 15 0 2548 2548 2368 S 0.0 0.0 0:00 sshd ~ 1368 zappos 15 0 2652 2652 2424 S 0.0 0.0 0:00 sshd Message from syslogd@db64 at Thu Oct 16 20:49:20 2003 ... db64 kernel: MCG_STATUS: unrecoverable memtest.bash: line 108: 1627 Segmentation fault tar -xzf $SOURCE_FILE -C $j Message from syslogd@db64 at Thu Oct 16 20:49:22 2003 ... db64 kernel: Northbridge Machine Check exception b40000000005001b 0 Message from syslogd@db64 at Thu Oct 16 20:49:22 2003 ... db64 kernel: Uncorrectable condition Message from syslogd@db64 at Thu Oct 16 20:49:22 2003 ... db64 kernel: Unrecoverable condition Message from syslogd@db64 at Thu Oct 16 20:49:22 2003 ... db64 kernel: Error uncorrected Message from syslogd@db64 at Thu Oct 16 20:49:22 2003 ... db64 kernel: Address: 0000000009470000 Message from syslogd@db64 at Thu Oct 16 20:49:22 2003 ... db64 kernel: MCE at EIP ffffffffa001412e ESP 10025659d98 Message from syslogd@db64 at Thu Oct 16 20:49:22 2003 ... db64 kernel: CPU 0: Machine Check Exception: 0000000000000000 Message from syslogd@db64 at Thu Oct 16 20:49:22 2003 ... db64 kernel: Kernel panic: Unable to continue Message from syslogd@db64 at Thu Oct 16 20:49:22 2003 ... db64 kernel: Kernel BUG at journal:3092 Message from syslogd@db64 at Thu Oct 16 20:49:22 2003 ... db64 kernel: invalid operand: 0000 -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.2.3 (MingW32) Comment: Using GnuPG with Thunderbird - http://enigmail.mozdev.org iD8DBQE/j25WuBLfyXibQuYRAlCmAJwK7J81+donOr3xnJwW5EUfiDSZmACePLHC jGzmj8K//nK7Fi7275meFLs= =Dx+U -----END PGP SIGNATURE-----