# HG changeset patch # User Albert Lee # Date 1271701541 14400 # Node ID fd074940082cc18bbeae34ee25d193141918c186 Import of existing Perl implementation. diff -r 000000000000 -r fd074940082c perl/Intrs.xs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/perl/Intrs.xs Mon Apr 19 14:25:41 2010 -0400 @@ -0,0 +1,109 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Non-shipping header - see Makefile.PL */ +#include + +#include "EXTERN.h" +#include "perl.h" +#include "XSUB.h" + +static int +open_dev(char *path) +{ + char intrpath[MAXPATHLEN]; + + (void) strcpy(intrpath, "/devices"); + (void) strcat(intrpath, path); + (void) strcat(intrpath, ":intr"); + return (open(intrpath, O_RDWR)); +} + +MODULE = Sun::Solaris::Intrs PACKAGE = Sun::Solaris::Intrs +PROTOTYPES: ENABLE + +int +intrmove(path, ino, cpu, num_ino) + char *path + int ino + int cpu + int num_ino + INIT: + int fd, ret; + pcitool_intr_set_t iset; + + CODE: + if ((fd = open_dev(path)) == -1) { + XSRETURN_UNDEF; + } + iset.ino = ino; + iset.cpu_id = cpu; + iset.flags = (num_ino > 1) ? PCITOOL_INTR_FLAG_SET_GROUP : 0; + iset.user_version = PCITOOL_VERSION; + + ret = ioctl(fd, PCITOOL_DEVICE_SET_INTR, &iset); + + if (ret == -1) { + XSRETURN_UNDEF; + } + (void) close(fd); + XSRETURN_YES; + +int +is_pcplusmp(path) + char *path + + INIT: + int fd, ret; + pcitool_intr_info_t iinfo; + + CODE: + if ((fd = open_dev(path)) == -1) { + XSRETURN_UNDEF; + } + iinfo.user_version = PCITOOL_VERSION; + + ret = ioctl(fd, PCITOOL_SYSTEM_INTR_INFO, &iinfo); + (void) close(fd); + + if (ret == -1) { + XSRETURN_UNDEF; + } + + if (iinfo.ctlr_type == PCITOOL_CTLR_TYPE_PCPLUSMP) { + XSRETURN_YES; + } + + XSRETURN_NO; diff -r 000000000000 -r fd074940082c perl/Kstat.xs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/perl/Kstat.xs Mon Apr 19 14:25:41 2010 -0400 @@ -0,0 +1,1715 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Kstat.xs is a Perl XS (eXStension module) that makes the Solaris + * kstat(3KSTAT) facility available to Perl scripts. Kstat is a general-purpose + * mechanism for providing kernel statistics to users. The Solaris API is + * function-based (see the manpage for details), but for ease of use in Perl + * scripts this module presents the information as a nested hash data structure. + * It would be too inefficient to read every kstat in the system, so this module + * uses the Perl TIEHASH mechanism to implement a read-on-demand semantic, which + * only reads and updates kstats as and when they are actually accessed. + */ + +/* + * Ignored raw kstats. + * + * Some raw kstats are ignored by this module, these are listed below. The + * most common reason is that the kstats are stored as arrays and the ks_ndata + * and/or ks_data_size fields are invalid. In this case it is impossible to + * know how many records are in the array, so they can't be read. + * + * unix:*:sfmmu_percpu_stat + * This is stored as an array with one entry per cpu. Each element is of type + * struct sfmmu_percpu_stat. The ks_ndata and ks_data_size fields are bogus. + * + * ufs directio:*:UFS DirectIO Stats + * The structure definition used for these kstats (ufs_directio_kstats) is in a + * C file (uts/common/fs/ufs/ufs_directio.c) rather than a header file, so it + * isn't accessible. + * + * qlc:*:statistics + * This is a third-party driver for which we don't have source. + * + * mm:*:phys_installed + * This is stored as an array of uint64_t, with each pair of values being the + * (address, size) of a memory segment. The ks_ndata and ks_data_size fields + * are both zero. + * + * sockfs:*:sock_unix_list + * This is stored as an array with one entry per active socket. Each element + * is of type struct k_sockinfo. The ks_ndata and ks_data_size fields are both + * zero. + * + * Note that the ks_ndata and ks_data_size of many non-array raw kstats are + * also incorrect. The relevant assertions are therefore commented out in the + * appropriate raw kstat read routines. + */ + +/* Kstat related includes */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Ultra-specific kstat includes */ +#ifdef __sparc +#include /* from /usr/platform/sun4u/include */ +#include /* from /usr/platform/sun4u/include */ +#include /* from /usr/platform/sun4u/include */ +#include /* from /usr/include */ +#endif + +/* + * Solaris #defines SP, which conflicts with the perl definition of SP + * We don't need the Solaris one, so get rid of it to avoid warnings + */ +#undef SP + +/* Perl XS includes */ +#include "EXTERN.h" +#include "perl.h" +#include "XSUB.h" + +/* Debug macros */ +#define DEBUG_ID "Sun::Solaris::Kstat" +#ifdef KSTAT_DEBUG +#define PERL_ASSERT(EXP) \ + ((void)((EXP) || (croak("%s: assertion failed at %s:%d: %s", \ + DEBUG_ID, __FILE__, __LINE__, #EXP), 0), 0)) +#define PERL_ASSERTMSG(EXP, MSG) \ + ((void)((EXP) || (croak(DEBUG_ID ": " MSG), 0), 0)) +#else +#define PERL_ASSERT(EXP) ((void)0) +#define PERL_ASSERTMSG(EXP, MSG) ((void)0) +#endif + +/* Macros for saving the contents of KSTAT_RAW structures */ +#if defined(HAS_QUAD) && defined(USE_64_BIT_INT) +#define NEW_IV(V) \ + (newSViv((IVTYPE) V)) +#define NEW_UV(V) \ + (newSVuv((UVTYPE) V)) +#else +#define NEW_IV(V) \ + (V >= IV_MIN && V <= IV_MAX ? newSViv((IVTYPE) V) : newSVnv((NVTYPE) V)) +#if defined(UVTYPE) +#define NEW_UV(V) \ + (V <= UV_MAX ? newSVuv((UVTYPE) V) : newSVnv((NVTYPE) V)) +# else +#define NEW_UV(V) \ + (V <= IV_MAX ? newSViv((IVTYPE) V) : newSVnv((NVTYPE) V)) +#endif +#endif +#define NEW_HRTIME(V) \ + newSVnv((NVTYPE) (V / 1000000000.0)) + +#define SAVE_FNP(H, F, K) \ + hv_store(H, K, sizeof (K) - 1, newSViv((IVTYPE) &F), 0) +#define SAVE_STRING(H, S, K, SS) \ + hv_store(H, #K, sizeof (#K) - 1, \ + newSVpvn(S->K, SS ? strlen(S->K) : sizeof(S->K)), 0) +#define SAVE_INT32(H, S, K) \ + hv_store(H, #K, sizeof (#K) - 1, NEW_IV(S->K), 0) +#define SAVE_UINT32(H, S, K) \ + hv_store(H, #K, sizeof (#K) - 1, NEW_UV(S->K), 0) +#define SAVE_INT64(H, S, K) \ + hv_store(H, #K, sizeof (#K) - 1, NEW_IV(S->K), 0) +#define SAVE_UINT64(H, S, K) \ + hv_store(H, #K, sizeof (#K) - 1, NEW_UV(S->K), 0) +#define SAVE_HRTIME(H, S, K) \ + hv_store(H, #K, sizeof (#K) - 1, NEW_HRTIME(S->K), 0) + +/* Private structure used for saving kstat info in the tied hashes */ +typedef struct { + char read; /* Kstat block has been read before */ + char valid; /* Kstat still exists in kstat chain */ + char strip_str; /* Strip KSTAT_DATA_CHAR fields */ + kstat_ctl_t *kstat_ctl; /* Handle returned by kstat_open */ + kstat_t *kstat; /* Handle used by kstat_read */ +} KstatInfo_t; + +/* typedef for apply_to_ties callback functions */ +typedef int (*ATTCb_t)(HV *, void *); + +/* typedef for raw kstat reader functions */ +typedef void (*kstat_raw_reader_t)(HV *, kstat_t *, int); + +/* Hash of "module:name" to KSTAT_RAW read functions */ +static HV *raw_kstat_lookup; + +/* + * Kstats come in two flavours, named and raw. Raw kstats are just C structs, + * so we need a function per raw kstat to convert the C struct into the + * corresponding perl hash. All such conversion functions are in the following + * section. + */ + +/* + * Definitions in /usr/include/sys/cpuvar.h and /usr/include/sys/sysinfo.h + */ + +static void +save_cpu_stat(HV *self, kstat_t *kp, int strip_str) +{ + cpu_stat_t *statp; + cpu_sysinfo_t *sysinfop; + cpu_syswait_t *syswaitp; + cpu_vminfo_t *vminfop; + + /* PERL_ASSERT(kp->ks_ndata == 1); */ + PERL_ASSERT(kp->ks_data_size == sizeof (cpu_stat_t)); + statp = (cpu_stat_t *)(kp->ks_data); + sysinfop = &statp->cpu_sysinfo; + syswaitp = &statp->cpu_syswait; + vminfop = &statp->cpu_vminfo; + + hv_store(self, "idle", 4, NEW_UV(sysinfop->cpu[CPU_IDLE]), 0); + hv_store(self, "user", 4, NEW_UV(sysinfop->cpu[CPU_USER]), 0); + hv_store(self, "kernel", 6, NEW_UV(sysinfop->cpu[CPU_KERNEL]), 0); + hv_store(self, "wait", 4, NEW_UV(sysinfop->cpu[CPU_WAIT]), 0); + hv_store(self, "wait_io", 7, NEW_UV(sysinfop->wait[W_IO]), 0); + hv_store(self, "wait_swap", 9, NEW_UV(sysinfop->wait[W_SWAP]), 0); + hv_store(self, "wait_pio", 8, NEW_UV(sysinfop->wait[W_PIO]), 0); + SAVE_UINT32(self, sysinfop, bread); + SAVE_UINT32(self, sysinfop, bwrite); + SAVE_UINT32(self, sysinfop, lread); + SAVE_UINT32(self, sysinfop, lwrite); + SAVE_UINT32(self, sysinfop, phread); + SAVE_UINT32(self, sysinfop, phwrite); + SAVE_UINT32(self, sysinfop, pswitch); + SAVE_UINT32(self, sysinfop, trap); + SAVE_UINT32(self, sysinfop, intr); + SAVE_UINT32(self, sysinfop, syscall); + SAVE_UINT32(self, sysinfop, sysread); + SAVE_UINT32(self, sysinfop, syswrite); + SAVE_UINT32(self, sysinfop, sysfork); + SAVE_UINT32(self, sysinfop, sysvfork); + SAVE_UINT32(self, sysinfop, sysexec); + SAVE_UINT32(self, sysinfop, readch); + SAVE_UINT32(self, sysinfop, writech); + SAVE_UINT32(self, sysinfop, rcvint); + SAVE_UINT32(self, sysinfop, xmtint); + SAVE_UINT32(self, sysinfop, mdmint); + SAVE_UINT32(self, sysinfop, rawch); + SAVE_UINT32(self, sysinfop, canch); + SAVE_UINT32(self, sysinfop, outch); + SAVE_UINT32(self, sysinfop, msg); + SAVE_UINT32(self, sysinfop, sema); + SAVE_UINT32(self, sysinfop, namei); + SAVE_UINT32(self, sysinfop, ufsiget); + SAVE_UINT32(self, sysinfop, ufsdirblk); + SAVE_UINT32(self, sysinfop, ufsipage); + SAVE_UINT32(self, sysinfop, ufsinopage); + SAVE_UINT32(self, sysinfop, inodeovf); + SAVE_UINT32(self, sysinfop, fileovf); + SAVE_UINT32(self, sysinfop, procovf); + SAVE_UINT32(self, sysinfop, intrthread); + SAVE_UINT32(self, sysinfop, intrblk); + SAVE_UINT32(self, sysinfop, idlethread); + SAVE_UINT32(self, sysinfop, inv_swtch); + SAVE_UINT32(self, sysinfop, nthreads); + SAVE_UINT32(self, sysinfop, cpumigrate); + SAVE_UINT32(self, sysinfop, xcalls); + SAVE_UINT32(self, sysinfop, mutex_adenters); + SAVE_UINT32(self, sysinfop, rw_rdfails); + SAVE_UINT32(self, sysinfop, rw_wrfails); + SAVE_UINT32(self, sysinfop, modload); + SAVE_UINT32(self, sysinfop, modunload); + SAVE_UINT32(self, sysinfop, bawrite); +#ifdef STATISTICS /* see header file */ + SAVE_UINT32(self, sysinfop, rw_enters); + SAVE_UINT32(self, sysinfop, win_uo_cnt); + SAVE_UINT32(self, sysinfop, win_uu_cnt); + SAVE_UINT32(self, sysinfop, win_so_cnt); + SAVE_UINT32(self, sysinfop, win_su_cnt); + SAVE_UINT32(self, sysinfop, win_suo_cnt); +#endif + + SAVE_INT32(self, syswaitp, iowait); + SAVE_INT32(self, syswaitp, swap); + SAVE_INT32(self, syswaitp, physio); + + SAVE_UINT32(self, vminfop, pgrec); + SAVE_UINT32(self, vminfop, pgfrec); + SAVE_UINT32(self, vminfop, pgin); + SAVE_UINT32(self, vminfop, pgpgin); + SAVE_UINT32(self, vminfop, pgout); + SAVE_UINT32(self, vminfop, pgpgout); + SAVE_UINT32(self, vminfop, swapin); + SAVE_UINT32(self, vminfop, pgswapin); + SAVE_UINT32(self, vminfop, swapout); + SAVE_UINT32(self, vminfop, pgswapout); + SAVE_UINT32(self, vminfop, zfod); + SAVE_UINT32(self, vminfop, dfree); + SAVE_UINT32(self, vminfop, scan); + SAVE_UINT32(self, vminfop, rev); + SAVE_UINT32(self, vminfop, hat_fault); + SAVE_UINT32(self, vminfop, as_fault); + SAVE_UINT32(self, vminfop, maj_fault); + SAVE_UINT32(self, vminfop, cow_fault); + SAVE_UINT32(self, vminfop, prot_fault); + SAVE_UINT32(self, vminfop, softlock); + SAVE_UINT32(self, vminfop, kernel_asflt); + SAVE_UINT32(self, vminfop, pgrrun); + SAVE_UINT32(self, vminfop, execpgin); + SAVE_UINT32(self, vminfop, execpgout); + SAVE_UINT32(self, vminfop, execfree); + SAVE_UINT32(self, vminfop, anonpgin); + SAVE_UINT32(self, vminfop, anonpgout); + SAVE_UINT32(self, vminfop, anonfree); + SAVE_UINT32(self, vminfop, fspgin); + SAVE_UINT32(self, vminfop, fspgout); + SAVE_UINT32(self, vminfop, fsfree); +} + +/* + * Definitions in /usr/include/sys/var.h + */ + +static void +save_var(HV *self, kstat_t *kp, int strip_str) +{ + struct var *varp; + + /* PERL_ASSERT(kp->ks_ndata == 1); */ + PERL_ASSERT(kp->ks_data_size == sizeof (struct var)); + varp = (struct var *)(kp->ks_data); + + SAVE_INT32(self, varp, v_buf); + SAVE_INT32(self, varp, v_call); + SAVE_INT32(self, varp, v_proc); + SAVE_INT32(self, varp, v_maxupttl); + SAVE_INT32(self, varp, v_nglobpris); + SAVE_INT32(self, varp, v_maxsyspri); + SAVE_INT32(self, varp, v_clist); + SAVE_INT32(self, varp, v_maxup); + SAVE_INT32(self, varp, v_hbuf); + SAVE_INT32(self, varp, v_hmask); + SAVE_INT32(self, varp, v_pbuf); + SAVE_INT32(self, varp, v_sptmap); + SAVE_INT32(self, varp, v_maxpmem); + SAVE_INT32(self, varp, v_autoup); + SAVE_INT32(self, varp, v_bufhwm); +} + +/* + * Definition in /usr/include/sys/vmmeter.h + */ + +static void +save_flushmeter(HV *self, kstat_t *kp, int strip_str) +{ + struct flushmeter *flushmeterp; + + /* PERL_ASSERT(kp->ks_ndata == 1); */ + PERL_ASSERT(kp->ks_data_size == sizeof (struct flushmeter)); + flushmeterp = (struct flushmeter *)(kp->ks_data); + + SAVE_UINT32(self, flushmeterp, f_ctx); + SAVE_UINT32(self, flushmeterp, f_segment); + SAVE_UINT32(self, flushmeterp, f_page); + SAVE_UINT32(self, flushmeterp, f_partial); + SAVE_UINT32(self, flushmeterp, f_usr); + SAVE_UINT32(self, flushmeterp, f_region); +} + +/* + * Definition in /usr/include/sys/dnlc.h + */ + +static void +save_ncstats(HV *self, kstat_t *kp, int strip_str) +{ + struct ncstats *ncstatsp; + + /* PERL_ASSERT(kp->ks_ndata == 1); */ + PERL_ASSERT(kp->ks_data_size == sizeof (struct ncstats)); + ncstatsp = (struct ncstats *)(kp->ks_data); + + SAVE_INT32(self, ncstatsp, hits); + SAVE_INT32(self, ncstatsp, misses); + SAVE_INT32(self, ncstatsp, enters); + SAVE_INT32(self, ncstatsp, dbl_enters); + SAVE_INT32(self, ncstatsp, long_enter); + SAVE_INT32(self, ncstatsp, long_look); + SAVE_INT32(self, ncstatsp, move_to_front); + SAVE_INT32(self, ncstatsp, purges); +} + +/* + * Definition in /usr/include/sys/sysinfo.h + */ + +static void +save_sysinfo(HV *self, kstat_t *kp, int strip_str) +{ + sysinfo_t *sysinfop; + + /* PERL_ASSERT(kp->ks_ndata == 1); */ + PERL_ASSERT(kp->ks_data_size == sizeof (sysinfo_t)); + sysinfop = (sysinfo_t *)(kp->ks_data); + + SAVE_UINT32(self, sysinfop, updates); + SAVE_UINT32(self, sysinfop, runque); + SAVE_UINT32(self, sysinfop, runocc); + SAVE_UINT32(self, sysinfop, swpque); + SAVE_UINT32(self, sysinfop, swpocc); + SAVE_UINT32(self, sysinfop, waiting); +} + +/* + * Definition in /usr/include/sys/sysinfo.h + */ + +static void +save_vminfo(HV *self, kstat_t *kp, int strip_str) +{ + vminfo_t *vminfop; + + /* PERL_ASSERT(kp->ks_ndata == 1); */ + PERL_ASSERT(kp->ks_data_size == sizeof (vminfo_t)); + vminfop = (vminfo_t *)(kp->ks_data); + + SAVE_UINT64(self, vminfop, freemem); + SAVE_UINT64(self, vminfop, swap_resv); + SAVE_UINT64(self, vminfop, swap_alloc); + SAVE_UINT64(self, vminfop, swap_avail); + SAVE_UINT64(self, vminfop, swap_free); +} + +/* + * Definition in /usr/include/nfs/nfs_clnt.h + */ + +static void +save_nfs(HV *self, kstat_t *kp, int strip_str) +{ + struct mntinfo_kstat *mntinfop; + + /* PERL_ASSERT(kp->ks_ndata == 1); */ + PERL_ASSERT(kp->ks_data_size == sizeof (struct mntinfo_kstat)); + mntinfop = (struct mntinfo_kstat *)(kp->ks_data); + + SAVE_STRING(self, mntinfop, mik_proto, strip_str); + SAVE_UINT32(self, mntinfop, mik_vers); + SAVE_UINT32(self, mntinfop, mik_flags); + SAVE_UINT32(self, mntinfop, mik_secmod); + SAVE_UINT32(self, mntinfop, mik_curread); + SAVE_UINT32(self, mntinfop, mik_curwrite); + SAVE_INT32(self, mntinfop, mik_timeo); + SAVE_INT32(self, mntinfop, mik_retrans); + SAVE_UINT32(self, mntinfop, mik_acregmin); + SAVE_UINT32(self, mntinfop, mik_acregmax); + SAVE_UINT32(self, mntinfop, mik_acdirmin); + SAVE_UINT32(self, mntinfop, mik_acdirmax); + hv_store(self, "lookup_srtt", 11, + NEW_UV(mntinfop->mik_timers[0].srtt), 0); + hv_store(self, "lookup_deviate", 14, + NEW_UV(mntinfop->mik_timers[0].deviate), 0); + hv_store(self, "lookup_rtxcur", 13, + NEW_UV(mntinfop->mik_timers[0].rtxcur), 0); + hv_store(self, "read_srtt", 9, + NEW_UV(mntinfop->mik_timers[1].srtt), 0); + hv_store(self, "read_deviate", 12, + NEW_UV(mntinfop->mik_timers[1].deviate), 0); + hv_store(self, "read_rtxcur", 11, + NEW_UV(mntinfop->mik_timers[1].rtxcur), 0); + hv_store(self, "write_srtt", 10, + NEW_UV(mntinfop->mik_timers[2].srtt), 0); + hv_store(self, "write_deviate", 13, + NEW_UV(mntinfop->mik_timers[2].deviate), 0); + hv_store(self, "write_rtxcur", 12, + NEW_UV(mntinfop->mik_timers[2].rtxcur), 0); + SAVE_UINT32(self, mntinfop, mik_noresponse); + SAVE_UINT32(self, mntinfop, mik_failover); + SAVE_UINT32(self, mntinfop, mik_remap); + SAVE_STRING(self, mntinfop, mik_curserver, strip_str); +} + +/* + * The following struct => hash functions are all only present on the sparc + * platform, so they are all conditionally compiled depending on __sparc + */ + +/* + * Definition in /usr/platform/sun4u/include/vm/hat_sfmmu.h + */ + +#ifdef __sparc +static void +save_sfmmu_global_stat(HV *self, kstat_t *kp, int strip_str) +{ + struct sfmmu_global_stat *sfmmugp; + + /* PERL_ASSERT(kp->ks_ndata == 1); */ + PERL_ASSERT(kp->ks_data_size == sizeof (struct sfmmu_global_stat)); + sfmmugp = (struct sfmmu_global_stat *)(kp->ks_data); + + SAVE_INT32(self, sfmmugp, sf_tsb_exceptions); + SAVE_INT32(self, sfmmugp, sf_tsb_raise_exception); + SAVE_INT32(self, sfmmugp, sf_pagefaults); + SAVE_INT32(self, sfmmugp, sf_uhash_searches); + SAVE_INT32(self, sfmmugp, sf_uhash_links); + SAVE_INT32(self, sfmmugp, sf_khash_searches); + SAVE_INT32(self, sfmmugp, sf_khash_links); + SAVE_INT32(self, sfmmugp, sf_swapout); + SAVE_INT32(self, sfmmugp, sf_tsb_alloc); + SAVE_INT32(self, sfmmugp, sf_tsb_allocfail); + SAVE_INT32(self, sfmmugp, sf_tsb_sectsb_create); + SAVE_INT32(self, sfmmugp, sf_scd_1sttsb_alloc); + SAVE_INT32(self, sfmmugp, sf_scd_2ndtsb_alloc); + SAVE_INT32(self, sfmmugp, sf_scd_1sttsb_allocfail); + SAVE_INT32(self, sfmmugp, sf_scd_2ndtsb_allocfail); + SAVE_INT32(self, sfmmugp, sf_tteload8k); + SAVE_INT32(self, sfmmugp, sf_tteload64k); + SAVE_INT32(self, sfmmugp, sf_tteload512k); + SAVE_INT32(self, sfmmugp, sf_tteload4m); + SAVE_INT32(self, sfmmugp, sf_tteload32m); + SAVE_INT32(self, sfmmugp, sf_tteload256m); + SAVE_INT32(self, sfmmugp, sf_tsb_load8k); + SAVE_INT32(self, sfmmugp, sf_tsb_load4m); + SAVE_INT32(self, sfmmugp, sf_hblk_hit); + SAVE_INT32(self, sfmmugp, sf_hblk8_ncreate); + SAVE_INT32(self, sfmmugp, sf_hblk8_nalloc); + SAVE_INT32(self, sfmmugp, sf_hblk1_ncreate); + SAVE_INT32(self, sfmmugp, sf_hblk1_nalloc); + SAVE_INT32(self, sfmmugp, sf_hblk_slab_cnt); + SAVE_INT32(self, sfmmugp, sf_hblk_reserve_cnt); + SAVE_INT32(self, sfmmugp, sf_hblk_recurse_cnt); + SAVE_INT32(self, sfmmugp, sf_hblk_reserve_hit); + SAVE_INT32(self, sfmmugp, sf_get_free_success); + SAVE_INT32(self, sfmmugp, sf_get_free_throttle); + SAVE_INT32(self, sfmmugp, sf_get_free_fail); + SAVE_INT32(self, sfmmugp, sf_put_free_success); + SAVE_INT32(self, sfmmugp, sf_put_free_fail); + SAVE_INT32(self, sfmmugp, sf_pgcolor_conflict); + SAVE_INT32(self, sfmmugp, sf_uncache_conflict); + SAVE_INT32(self, sfmmugp, sf_unload_conflict); + SAVE_INT32(self, sfmmugp, sf_ism_uncache); + SAVE_INT32(self, sfmmugp, sf_ism_recache); + SAVE_INT32(self, sfmmugp, sf_recache); + SAVE_INT32(self, sfmmugp, sf_steal_count); + SAVE_INT32(self, sfmmugp, sf_pagesync); + SAVE_INT32(self, sfmmugp, sf_clrwrt); + SAVE_INT32(self, sfmmugp, sf_pagesync_invalid); + SAVE_INT32(self, sfmmugp, sf_kernel_xcalls); + SAVE_INT32(self, sfmmugp, sf_user_xcalls); + SAVE_INT32(self, sfmmugp, sf_tsb_grow); + SAVE_INT32(self, sfmmugp, sf_tsb_shrink); + SAVE_INT32(self, sfmmugp, sf_tsb_resize_failures); + SAVE_INT32(self, sfmmugp, sf_tsb_reloc); + SAVE_INT32(self, sfmmugp, sf_user_vtop); + SAVE_INT32(self, sfmmugp, sf_ctx_inv); + SAVE_INT32(self, sfmmugp, sf_tlb_reprog_pgsz); + SAVE_INT32(self, sfmmugp, sf_region_remap_demap); + SAVE_INT32(self, sfmmugp, sf_create_scd); + SAVE_INT32(self, sfmmugp, sf_join_scd); + SAVE_INT32(self, sfmmugp, sf_leave_scd); + SAVE_INT32(self, sfmmugp, sf_destroy_scd); +} +#endif + +/* + * Definition in /usr/platform/sun4u/include/vm/hat_sfmmu.h + */ + +#ifdef __sparc +static void +save_sfmmu_tsbsize_stat(HV *self, kstat_t *kp, int strip_str) +{ + struct sfmmu_tsbsize_stat *sfmmutp; + + /* PERL_ASSERT(kp->ks_ndata == 1); */ + PERL_ASSERT(kp->ks_data_size == sizeof (struct sfmmu_tsbsize_stat)); + sfmmutp = (struct sfmmu_tsbsize_stat *)(kp->ks_data); + + SAVE_INT32(self, sfmmutp, sf_tsbsz_8k); + SAVE_INT32(self, sfmmutp, sf_tsbsz_16k); + SAVE_INT32(self, sfmmutp, sf_tsbsz_32k); + SAVE_INT32(self, sfmmutp, sf_tsbsz_64k); + SAVE_INT32(self, sfmmutp, sf_tsbsz_128k); + SAVE_INT32(self, sfmmutp, sf_tsbsz_256k); + SAVE_INT32(self, sfmmutp, sf_tsbsz_512k); + SAVE_INT32(self, sfmmutp, sf_tsbsz_1m); + SAVE_INT32(self, sfmmutp, sf_tsbsz_2m); + SAVE_INT32(self, sfmmutp, sf_tsbsz_4m); +} +#endif + +/* + * Definition in /usr/platform/sun4u/include/sys/simmstat.h + */ + +#ifdef __sparc +static void +save_simmstat(HV *self, kstat_t *kp, int strip_str) +{ + uchar_t *simmstatp; + SV *list; + int i; + + /* PERL_ASSERT(kp->ks_ndata == 1); */ + PERL_ASSERT(kp->ks_data_size == sizeof (uchar_t) * SIMM_COUNT); + + list = newSVpv("", 0); + for (i = 0, simmstatp = (uchar_t *)(kp->ks_data); + i < SIMM_COUNT - 1; i++, simmstatp++) { + sv_catpvf(list, "%d,", *simmstatp); + } + sv_catpvf(list, "%d", *simmstatp); + hv_store(self, "status", 6, list, 0); +} +#endif + +/* + * Used by save_temperature to make CSV lists from arrays of + * short temperature values + */ + +#ifdef __sparc +static SV * +short_array_to_SV(short *shortp, int len) +{ + SV *list; + + list = newSVpv("", 0); + for (; len > 1; len--, shortp++) { + sv_catpvf(list, "%d,", *shortp); + } + sv_catpvf(list, "%d", *shortp); + return (list); +} + +/* + * Definition in /usr/platform/sun4u/include/sys/fhc.h + */ + +static void +save_temperature(HV *self, kstat_t *kp, int strip_str) +{ + struct temp_stats *tempsp; + + /* PERL_ASSERT(kp->ks_ndata == 1); */ + PERL_ASSERT(kp->ks_data_size == sizeof (struct temp_stats)); + tempsp = (struct temp_stats *)(kp->ks_data); + + SAVE_UINT32(self, tempsp, index); + hv_store(self, "l1", 2, short_array_to_SV(tempsp->l1, L1_SZ), 0); + hv_store(self, "l2", 2, short_array_to_SV(tempsp->l2, L2_SZ), 0); + hv_store(self, "l3", 2, short_array_to_SV(tempsp->l3, L3_SZ), 0); + hv_store(self, "l4", 2, short_array_to_SV(tempsp->l4, L4_SZ), 0); + hv_store(self, "l5", 2, short_array_to_SV(tempsp->l5, L5_SZ), 0); + SAVE_INT32(self, tempsp, max); + SAVE_INT32(self, tempsp, min); + SAVE_INT32(self, tempsp, state); + SAVE_INT32(self, tempsp, temp_cnt); + SAVE_INT32(self, tempsp, shutdown_cnt); + SAVE_INT32(self, tempsp, version); + SAVE_INT32(self, tempsp, trend); + SAVE_INT32(self, tempsp, override); +} +#endif + +/* + * Not actually defined anywhere - just a short. Yuck. + */ + +#ifdef __sparc +static void +save_temp_over(HV *self, kstat_t *kp, int strip_str) +{ + short *shortp; + + /* PERL_ASSERT(kp->ks_ndata == 1); */ + PERL_ASSERT(kp->ks_data_size == sizeof (short)); + + shortp = (short *)(kp->ks_data); + hv_store(self, "override", 8, newSViv(*shortp), 0); +} +#endif + +/* + * Defined in /usr/platform/sun4u/include/sys/sysctrl.h + * (Well, sort of. Actually there's no structure, just a list of #defines + * enumerating *some* of the array indexes.) + */ + +#ifdef __sparc +static void +save_ps_shadow(HV *self, kstat_t *kp, int strip_str) +{ + uchar_t *ucharp; + + /* PERL_ASSERT(kp->ks_ndata == 1); */ + PERL_ASSERT(kp->ks_data_size == SYS_PS_COUNT); + + ucharp = (uchar_t *)(kp->ks_data); + hv_store(self, "core_0", 6, newSViv(*ucharp++), 0); + hv_store(self, "core_1", 6, newSViv(*ucharp++), 0); + hv_store(self, "core_2", 6, newSViv(*ucharp++), 0); + hv_store(self, "core_3", 6, newSViv(*ucharp++), 0); + hv_store(self, "core_4", 6, newSViv(*ucharp++), 0); + hv_store(self, "core_5", 6, newSViv(*ucharp++), 0); + hv_store(self, "core_6", 6, newSViv(*ucharp++), 0); + hv_store(self, "core_7", 6, newSViv(*ucharp++), 0); + hv_store(self, "pps_0", 5, newSViv(*ucharp++), 0); + hv_store(self, "clk_33", 6, newSViv(*ucharp++), 0); + hv_store(self, "clk_50", 6, newSViv(*ucharp++), 0); + hv_store(self, "v5_p", 4, newSViv(*ucharp++), 0); + hv_store(self, "v12_p", 5, newSViv(*ucharp++), 0); + hv_store(self, "v5_aux", 6, newSViv(*ucharp++), 0); + hv_store(self, "v5_p_pch", 8, newSViv(*ucharp++), 0); + hv_store(self, "v12_p_pch", 9, newSViv(*ucharp++), 0); + hv_store(self, "v3_pch", 6, newSViv(*ucharp++), 0); + hv_store(self, "v5_pch", 6, newSViv(*ucharp++), 0); + hv_store(self, "p_fan", 5, newSViv(*ucharp++), 0); +} +#endif + +/* + * Definition in /usr/platform/sun4u/include/sys/fhc.h + */ + +#ifdef __sparc +static void +save_fault_list(HV *self, kstat_t *kp, int strip_str) +{ + struct ft_list *faultp; + int i; + char name[KSTAT_STRLEN + 7]; /* room for 999999 faults */ + + /* PERL_ASSERT(kp->ks_ndata == 1); */ + /* PERL_ASSERT(kp->ks_data_size == sizeof (struct ft_list)); */ + + for (i = 1, faultp = (struct ft_list *)(kp->ks_data); + i <= 999999 && i <= kp->ks_data_size / sizeof (struct ft_list); + i++, faultp++) { + (void) snprintf(name, sizeof (name), "unit_%d", i); + hv_store(self, name, strlen(name), newSViv(faultp->unit), 0); + (void) snprintf(name, sizeof (name), "type_%d", i); + hv_store(self, name, strlen(name), newSViv(faultp->type), 0); + (void) snprintf(name, sizeof (name), "fclass_%d", i); + hv_store(self, name, strlen(name), newSViv(faultp->fclass), 0); + (void) snprintf(name, sizeof (name), "create_time_%d", i); + hv_store(self, name, strlen(name), + NEW_UV(faultp->create_time), 0); + (void) snprintf(name, sizeof (name), "msg_%d", i); + hv_store(self, name, strlen(name), newSVpv(faultp->msg, 0), 0); + } +} +#endif + +/* + * We need to be able to find the function corresponding to a particular raw + * kstat. To do this we ignore the instance and glue the module and name + * together to form a composite key. We can then use the data in the kstat + * structure to find the appropriate function. We use a perl hash to manage the + * lookup, where the key is "module:name" and the value is a pointer to the + * appropriate C function. + * + * Note that some kstats include the instance number as part of the module + * and/or name. This could be construed as a bug. However, to work around this + * we omit any digits from the module and name as we build the table in + * build_raw_kstat_loopup(), and we remove any digits from the module and name + * when we look up the functions in lookup_raw_kstat_fn() + */ + +/* + * This function is called when the XS is first dlopen()ed, and builds the + * lookup table as described above. + */ + +static void +build_raw_kstat_lookup() + { + /* Create new hash */ + raw_kstat_lookup = newHV(); + + SAVE_FNP(raw_kstat_lookup, save_cpu_stat, "cpu_stat:cpu_stat"); + SAVE_FNP(raw_kstat_lookup, save_var, "unix:var"); + SAVE_FNP(raw_kstat_lookup, save_flushmeter, "unix:flushmeter"); + SAVE_FNP(raw_kstat_lookup, save_ncstats, "unix:ncstats"); + SAVE_FNP(raw_kstat_lookup, save_sysinfo, "unix:sysinfo"); + SAVE_FNP(raw_kstat_lookup, save_vminfo, "unix:vminfo"); + SAVE_FNP(raw_kstat_lookup, save_nfs, "nfs:mntinfo"); +#ifdef __sparc + SAVE_FNP(raw_kstat_lookup, save_sfmmu_global_stat, + "unix:sfmmu_global_stat"); + SAVE_FNP(raw_kstat_lookup, save_sfmmu_tsbsize_stat, + "unix:sfmmu_tsbsize_stat"); + SAVE_FNP(raw_kstat_lookup, save_simmstat, "unix:simm-status"); + SAVE_FNP(raw_kstat_lookup, save_temperature, "unix:temperature"); + SAVE_FNP(raw_kstat_lookup, save_temp_over, "unix:temperature override"); + SAVE_FNP(raw_kstat_lookup, save_ps_shadow, "unix:ps_shadow"); + SAVE_FNP(raw_kstat_lookup, save_fault_list, "unix:fault_list"); +#endif +} + +/* + * This finds and returns the raw kstat reader function corresponding to the + * supplied module and name. If no matching function exists, 0 is returned. + */ + +static kstat_raw_reader_t lookup_raw_kstat_fn(char *module, char *name) + { + char key[KSTAT_STRLEN * 2]; + register char *f, *t; + SV **entry; + kstat_raw_reader_t fnp; + + /* Copy across module & name, removing any digits - see comment above */ + for (f = module, t = key; *f != '\0'; f++, t++) { + while (*f != '\0' && isdigit(*f)) { f++; } + *t = *f; + } + *t++ = ':'; + for (f = name; *f != '\0'; f++, t++) { + while (*f != '\0' && isdigit(*f)) { + f++; + } + *t = *f; + } + *t = '\0'; + + /* look up & return the function, or teturn 0 if not found */ + if ((entry = hv_fetch(raw_kstat_lookup, key, strlen(key), FALSE)) == 0) + { + fnp = 0; + } else { + fnp = (kstat_raw_reader_t)(uintptr_t)SvIV(*entry); + } + return (fnp); +} + +/* + * This module converts the flat list returned by kstat_read() into a perl hash + * tree keyed on module, instance, name and statistic. The following functions + * provide code to create the nested hashes, and to iterate over them. + */ + +/* + * Given module, instance and name keys return a pointer to the hash tied to + * the bottommost hash. If the hash already exists, we just return a pointer + * to it, otherwise we create the hash and any others also required above it in + * the hierarchy. The returned tiehash is blessed into the + * Sun::Solaris::Kstat::_Stat class, so that the appropriate TIEHASH methods are + * called when the bottommost hash is accessed. If the is_new parameter is + * non-null it will be set to TRUE if a new tie has been created, and FALSE if + * the tie already existed. + */ + +static HV * +get_tie(SV *self, char *module, int instance, char *name, int *is_new) +{ + char str_inst[11]; /* big enough for up to 10^10 instances */ + char *key[3]; /* 3 part key: module, instance, name */ + int k; + int new; + HV *hash; + HV *tie; + + /* Create the keys */ + (void) snprintf(str_inst, sizeof (str_inst), "%d", instance); + key[0] = module; + key[1] = str_inst; + key[2] = name; + + /* Iteratively descend the tree, creating new hashes as required */ + hash = (HV *)SvRV(self); + for (k = 0; k < 3; k++) { + SV **entry; + + SvREADONLY_off(hash); + entry = hv_fetch(hash, key[k], strlen(key[k]), TRUE); + + /* If the entry doesn't exist, create it */ + if (! SvOK(*entry)) { + HV *newhash; + SV *rv; + + newhash = newHV(); + rv = newRV_noinc((SV *)newhash); + sv_setsv(*entry, rv); + SvREFCNT_dec(rv); + if (k < 2) { + SvREADONLY_on(newhash); + } + SvREADONLY_on(*entry); + SvREADONLY_on(hash); + hash = newhash; + new = 1; + + /* Otherwise it already existed */ + } else { + SvREADONLY_on(hash); + hash = (HV *)SvRV(*entry); + new = 0; + } + } + + /* Create and bless a hash for the tie, if necessary */ + if (new) { + SV *tieref; + HV *stash; + + tie = newHV(); + tieref = newRV_noinc((SV *)tie); + stash = gv_stashpv("Sun::Solaris::Kstat::_Stat", TRUE); + sv_bless(tieref, stash); + + /* Add TIEHASH magic */ + hv_magic(hash, (GV *)tieref, 'P'); + SvREADONLY_on(hash); + + /* Otherwise, just find the existing tied hash */ + } else { + MAGIC *mg; + + mg = mg_find((SV *)hash, 'P'); + PERL_ASSERTMSG(mg != 0, "get_tie: lost P magic"); + tie = (HV *)SvRV(mg->mg_obj); + } + if (is_new) { + *is_new = new; + } + return (tie); +} + +/* + * This is an iterator function used to traverse the hash hierarchy and apply + * the passed function to the tied hashes at the bottom of the hierarchy. If + * any of the callback functions return 0, 0 is returned, otherwise 1 + */ + +static int +apply_to_ties(SV *self, ATTCb_t cb, void *arg) +{ + HV *hash1; + HE *entry1; + long s; + int ret; + + hash1 = (HV *)SvRV(self); + hv_iterinit(hash1); + ret = 1; + + /* Iterate over each module */ + while (entry1 = hv_iternext(hash1)) { + HV *hash2; + HE *entry2; + + hash2 = (HV *)SvRV(hv_iterval(hash1, entry1)); + hv_iterinit(hash2); + + /* Iterate over each module:instance */ + while (entry2 = hv_iternext(hash2)) { + HV *hash3; + HE *entry3; + + hash3 = (HV *)SvRV(hv_iterval(hash2, entry2)); + hv_iterinit(hash3); + + /* Iterate over each module:instance:name */ + while (entry3 = hv_iternext(hash3)) { + HV *hash4; + MAGIC *mg; + HV *tie; + + /* Get the tie */ + hash4 = (HV *)SvRV(hv_iterval(hash3, entry3)); + mg = mg_find((SV *)hash4, 'P'); + PERL_ASSERTMSG(mg != 0, + "apply_to_ties: lost P magic"); + + /* Apply the callback */ + if (! cb((HV *)SvRV(mg->mg_obj), arg)) { + ret = 0; + } + } + } + } + return (ret); +} + +/* + * Mark this HV as valid - used by update() when pruning deleted kstat nodes + */ + +static int +set_valid(HV *self, void *arg) +{ + MAGIC *mg; + + mg = mg_find((SV *)self, '~'); + PERL_ASSERTMSG(mg != 0, "set_valid: lost ~ magic"); + ((KstatInfo_t *)SvPVX(mg->mg_obj))->valid = (int)arg; + return (1); +} + +/* + * Prune invalid kstat nodes. This is called when kstat_chain_update() detects + * that the kstat chain has been updated. This removes any hash tree entries + * that no longer have a corresponding kstat. If del is non-null it will be + * set to the keys of the deleted kstat nodes, if any. If any entries are + * deleted 1 will be retured, otherwise 0 + */ + +static int +prune_invalid(SV *self, AV *del) +{ + HV *hash1; + HE *entry1; + STRLEN klen; + char *module, *instance, *name, *key; + int ret; + + hash1 = (HV *)SvRV(self); + hv_iterinit(hash1); + ret = 0; + + /* Iterate over each module */ + while (entry1 = hv_iternext(hash1)) { + HV *hash2; + HE *entry2; + + module = HePV(entry1, PL_na); + hash2 = (HV *)SvRV(hv_iterval(hash1, entry1)); + hv_iterinit(hash2); + + /* Iterate over each module:instance */ + while (entry2 = hv_iternext(hash2)) { + HV *hash3; + HE *entry3; + + instance = HePV(entry2, PL_na); + hash3 = (HV *)SvRV(hv_iterval(hash2, entry2)); + hv_iterinit(hash3); + + /* Iterate over each module:instance:name */ + while (entry3 = hv_iternext(hash3)) { + HV *hash4; + MAGIC *mg; + HV *tie; + + name = HePV(entry3, PL_na); + hash4 = (HV *)SvRV(hv_iterval(hash3, entry3)); + mg = mg_find((SV *)hash4, 'P'); + PERL_ASSERTMSG(mg != 0, + "prune_invalid: lost P magic"); + tie = (HV *)SvRV(mg->mg_obj); + mg = mg_find((SV *)tie, '~'); + PERL_ASSERTMSG(mg != 0, + "prune_invalid: lost ~ magic"); + + /* If this is marked as invalid, prune it */ + if (((KstatInfo_t *)SvPVX( + (SV *)mg->mg_obj))->valid == FALSE) { + SvREADONLY_off(hash3); + key = HePV(entry3, klen); + hv_delete(hash3, key, klen, G_DISCARD); + SvREADONLY_on(hash3); + if (del) { + av_push(del, + newSVpvf("%s:%s:%s", + module, instance, name)); + } + ret = 1; + } + } + + /* If the module:instance:name hash is empty prune it */ + if (HvKEYS(hash3) == 0) { + SvREADONLY_off(hash2); + key = HePV(entry2, klen); + hv_delete(hash2, key, klen, G_DISCARD); + SvREADONLY_on(hash2); + } + } + /* If the module:instance hash is empty prune it */ + if (HvKEYS(hash2) == 0) { + SvREADONLY_off(hash1); + key = HePV(entry1, klen); + hv_delete(hash1, key, klen, G_DISCARD); + SvREADONLY_on(hash1); + } + } + return (ret); +} + +/* + * Named kstats are returned as a list of key/values. This function converts + * such a list into the equivalent perl datatypes, and stores them in the passed + * hash. + */ + +static void +save_named(HV *self, kstat_t *kp, int strip_str) +{ + kstat_named_t *knp; + int n; + SV* value; + + for (n = kp->ks_ndata, knp = KSTAT_NAMED_PTR(kp); n > 0; n--, knp++) { + switch (knp->data_type) { + case KSTAT_DATA_CHAR: + value = newSVpv(knp->value.c, strip_str ? + strlen(knp->value.c) : sizeof (knp->value.c)); + break; + case KSTAT_DATA_INT32: + value = newSViv(knp->value.i32); + break; + case KSTAT_DATA_UINT32: + value = NEW_UV(knp->value.ui32); + break; + case KSTAT_DATA_INT64: + value = NEW_UV(knp->value.i64); + break; + case KSTAT_DATA_UINT64: + value = NEW_UV(knp->value.ui64); + break; + case KSTAT_DATA_STRING: + if (KSTAT_NAMED_STR_PTR(knp) == NULL) + value = newSVpv("null", sizeof ("null") - 1); + else + value = newSVpv(KSTAT_NAMED_STR_PTR(knp), + KSTAT_NAMED_STR_BUFLEN(knp) -1); + break; + default: + PERL_ASSERTMSG(0, "kstat_read: invalid data type"); + break; + } + hv_store(self, knp->name, strlen(knp->name), value, 0); + } +} + +/* + * Save kstat interrupt statistics + */ + +static void +save_intr(HV *self, kstat_t *kp, int strip_str) +{ + kstat_intr_t *kintrp; + int i; + static char *intr_names[] = + { "hard", "soft", "watchdog", "spurious", "multiple_service" }; + + PERL_ASSERT(kp->ks_ndata == 1); + PERL_ASSERT(kp->ks_data_size == sizeof (kstat_intr_t)); + kintrp = KSTAT_INTR_PTR(kp); + + for (i = 0; i < KSTAT_NUM_INTRS; i++) { + hv_store(self, intr_names[i], strlen(intr_names[i]), + NEW_UV(kintrp->intrs[i]), 0); + } +} + +/* + * Save IO statistics + */ + +static void +save_io(HV *self, kstat_t *kp, int strip_str) +{ + kstat_io_t *kiop; + + PERL_ASSERT(kp->ks_ndata == 1); + PERL_ASSERT(kp->ks_data_size == sizeof (kstat_io_t)); + kiop = KSTAT_IO_PTR(kp); + SAVE_UINT64(self, kiop, nread); + SAVE_UINT64(self, kiop, nwritten); + SAVE_UINT32(self, kiop, reads); + SAVE_UINT32(self, kiop, writes); + SAVE_HRTIME(self, kiop, wtime); + SAVE_HRTIME(self, kiop, wlentime); + SAVE_HRTIME(self, kiop, wlastupdate); + SAVE_HRTIME(self, kiop, rtime); + SAVE_HRTIME(self, kiop, rlentime); + SAVE_HRTIME(self, kiop, rlastupdate); + SAVE_UINT32(self, kiop, wcnt); + SAVE_UINT32(self, kiop, rcnt); +} + +/* + * Save timer statistics + */ + +static void +save_timer(HV *self, kstat_t *kp, int strip_str) +{ + kstat_timer_t *ktimerp; + + PERL_ASSERT(kp->ks_ndata == 1); + PERL_ASSERT(kp->ks_data_size == sizeof (kstat_timer_t)); + ktimerp = KSTAT_TIMER_PTR(kp); + SAVE_STRING(self, ktimerp, name, strip_str); + SAVE_UINT64(self, ktimerp, num_events); + SAVE_HRTIME(self, ktimerp, elapsed_time); + SAVE_HRTIME(self, ktimerp, min_time); + SAVE_HRTIME(self, ktimerp, max_time); + SAVE_HRTIME(self, ktimerp, start_time); + SAVE_HRTIME(self, ktimerp, stop_time); +} + +/* + * Read kstats and copy into the supplied perl hash structure. If refresh is + * true, this function is being called as part of the update() method. In this + * case it is only necessary to read the kstats if they have previously been + * accessed (kip->read == TRUE). If refresh is false, this function is being + * called prior to returning a value to the caller. In this case, it is only + * necessary to read the kstats if they have not previously been read. If the + * kstat_read() fails, 0 is returned, otherwise 1 + */ + +static int +read_kstats(HV *self, int refresh) +{ + MAGIC *mg; + KstatInfo_t *kip; + kstat_raw_reader_t fnp; + + /* Find the MAGIC KstatInfo_t data structure */ + mg = mg_find((SV *)self, '~'); + PERL_ASSERTMSG(mg != 0, "read_kstats: lost ~ magic"); + kip = (KstatInfo_t *)SvPVX(mg->mg_obj); + + /* Return early if we don't need to actually read the kstats */ + if ((refresh && ! kip->read) || (! refresh && kip->read)) { + return (1); + } + + /* Read the kstats and return 0 if this fails */ + if (kstat_read(kip->kstat_ctl, kip->kstat, NULL) < 0) { + return (0); + } + + /* Save the read data */ + hv_store(self, "snaptime", 8, NEW_HRTIME(kip->kstat->ks_snaptime), 0); + switch (kip->kstat->ks_type) { + case KSTAT_TYPE_RAW: + if ((fnp = lookup_raw_kstat_fn(kip->kstat->ks_module, + kip->kstat->ks_name)) != 0) { + fnp(self, kip->kstat, kip->strip_str); + } + break; + case KSTAT_TYPE_NAMED: + save_named(self, kip->kstat, kip->strip_str); + break; + case KSTAT_TYPE_INTR: + save_intr(self, kip->kstat, kip->strip_str); + break; + case KSTAT_TYPE_IO: + save_io(self, kip->kstat, kip->strip_str); + break; + case KSTAT_TYPE_TIMER: + save_timer(self, kip->kstat, kip->strip_str); + break; + default: + PERL_ASSERTMSG(0, "read_kstats: illegal kstat type"); + break; + } + kip->read = TRUE; + return (1); +} + +/* + * The XS code exported to perl is below here. Note that the XS preprocessor + * has its own commenting syntax, so all comments from this point on are in + * that form. + */ + +/* The following XS methods are the ABI of the Sun::Solaris::Kstat package */ + +MODULE = Sun::Solaris::Kstat PACKAGE = Sun::Solaris::Kstat +PROTOTYPES: ENABLE + + # Create the raw kstat to store function lookup table on load +BOOT: + build_raw_kstat_lookup(); + + # + # The Sun::Solaris::Kstat constructor. This builds the nested + # name::instance::module hash structure, but doesn't actually read the + # underlying kstats. This is done on demand by the TIEHASH methods in + # Sun::Solaris::Kstat::_Stat + # + +SV* +new(class, ...) + char *class; +PREINIT: + HV *stash; + kstat_ctl_t *kc; + SV *kcsv; + kstat_t *kp; + KstatInfo_t kstatinfo; + int sp, strip_str; +CODE: + /* Check we have an even number of arguments, excluding the class */ + sp = 1; + if (((items - sp) % 2) != 0) { + croak(DEBUG_ID ": new: invalid number of arguments"); + } + + /* Process any (name => value) arguments */ + strip_str = 0; + while (sp < items) { + SV *name, *value; + + name = ST(sp); + sp++; + value = ST(sp); + sp++; + if (strcmp(SvPVX(name), "strip_strings") == 0) { + strip_str = SvTRUE(value); + } else { + croak(DEBUG_ID ": new: invalid parameter name '%s'", + SvPVX(name)); + } + } + + /* Open the kstats handle */ + if ((kc = kstat_open()) == 0) { + XSRETURN_UNDEF; + } + + /* Create a blessed hash ref */ + RETVAL = (SV *)newRV_noinc((SV *)newHV()); + stash = gv_stashpv(class, TRUE); + sv_bless(RETVAL, stash); + + /* Create a place to save the KstatInfo_t structure */ + kcsv = newSVpv((char *)&kc, sizeof (kc)); + sv_magic(SvRV(RETVAL), kcsv, '~', 0, 0); + SvREFCNT_dec(kcsv); + + /* Initialise the KstatsInfo_t structure */ + kstatinfo.read = FALSE; + kstatinfo.valid = TRUE; + kstatinfo.strip_str = strip_str; + kstatinfo.kstat_ctl = kc; + + /* Scan the kstat chain, building hash entries for the kstats */ + for (kp = kc->kc_chain; kp != 0; kp = kp->ks_next) { + HV *tie; + SV *kstatsv; + + /* Don't bother storing the kstat headers */ + if (strncmp(kp->ks_name, "kstat_", 6) == 0) { + continue; + } + + /* Don't bother storing raw stats we don't understand */ + if (kp->ks_type == KSTAT_TYPE_RAW && + lookup_raw_kstat_fn(kp->ks_module, kp->ks_name) == 0) { +#ifdef REPORT_UNKNOWN + (void) fprintf(stderr, + "Unknown kstat type %s:%d:%s - %d of size %d\n", + kp->ks_module, kp->ks_instance, kp->ks_name, + kp->ks_ndata, kp->ks_data_size); +#endif + continue; + } + + /* Create a 3-layer hash hierarchy - module.instance.name */ + tie = get_tie(RETVAL, kp->ks_module, kp->ks_instance, + kp->ks_name, 0); + + /* Save the data necessary to read the kstat info on demand */ + hv_store(tie, "class", 5, newSVpv(kp->ks_class, 0), 0); + hv_store(tie, "crtime", 6, NEW_HRTIME(kp->ks_crtime), 0); + kstatinfo.kstat = kp; + kstatsv = newSVpv((char *)&kstatinfo, sizeof (kstatinfo)); + sv_magic((SV *)tie, kstatsv, '~', 0, 0); + SvREFCNT_dec(kstatsv); + } + SvREADONLY_on(SvRV(RETVAL)); + /* SvREADONLY_on(RETVAL); */ +OUTPUT: + RETVAL + + # + # Update the perl hash structure so that it is in line with the kernel kstats + # data. Only kstats athat have previously been accessed are read, + # + + # Scalar context: true/false + # Array context: (\@added, \@deleted) +void +update(self) + SV* self; +PREINIT: + MAGIC *mg; + kstat_ctl_t *kc; + kstat_t *kp; + int ret; + AV *add, *del; +PPCODE: + /* Find the hidden KstatInfo_t structure */ + mg = mg_find(SvRV(self), '~'); + PERL_ASSERTMSG(mg != 0, "update: lost ~ magic"); + kc = *(kstat_ctl_t **)SvPVX(mg->mg_obj); + + /* Update the kstat chain, and return immediately on error. */ + if ((ret = kstat_chain_update(kc)) == -1) { + if (GIMME_V == G_ARRAY) { + EXTEND(SP, 2); + PUSHs(sv_newmortal()); + PUSHs(sv_newmortal()); + } else { + EXTEND(SP, 1); + PUSHs(sv_2mortal(newSViv(ret))); + } + } + + /* Create the arrays to be returned if in an array context */ + if (GIMME_V == G_ARRAY) { + add = newAV(); + del = newAV(); + } else { + add = 0; + del = 0; + } + + /* + * If the kstat chain hasn't changed we can just reread any stats + * that have already been read + */ + if (ret == 0) { + if (! apply_to_ties(self, (ATTCb_t)read_kstats, (void *)TRUE)) { + if (GIMME_V == G_ARRAY) { + EXTEND(SP, 2); + PUSHs(sv_2mortal(newRV_noinc((SV *)add))); + PUSHs(sv_2mortal(newRV_noinc((SV *)del))); + } else { + EXTEND(SP, 1); + PUSHs(sv_2mortal(newSViv(-1))); + } + } + + /* + * Otherwise we have to update the Perl structure so that it is in + * agreement with the new kstat chain. We do this in such a way as to + * retain all the existing structures, just adding or deleting the + * bare minimum. + */ + } else { + KstatInfo_t kstatinfo; + + /* + * Step 1: set the 'invalid' flag on each entry + */ + apply_to_ties(self, &set_valid, (void *)FALSE); + + /* + * Step 2: Set the 'valid' flag on all entries still in the + * kernel kstat chain + */ + kstatinfo.read = FALSE; + kstatinfo.valid = TRUE; + kstatinfo.kstat_ctl = kc; + for (kp = kc->kc_chain; kp != 0; kp = kp->ks_next) { + int new; + HV *tie; + + /* Don't bother storing the kstat headers or types */ + if (strncmp(kp->ks_name, "kstat_", 6) == 0) { + continue; + } + + /* Don't bother storing raw stats we don't understand */ + if (kp->ks_type == KSTAT_TYPE_RAW && + lookup_raw_kstat_fn(kp->ks_module, kp->ks_name) + == 0) { +#ifdef REPORT_UNKNOWN + (void) printf("Unknown kstat type %s:%d:%s " + "- %d of size %d\n", kp->ks_module, + kp->ks_instance, kp->ks_name, + kp->ks_ndata, kp->ks_data_size); +#endif + continue; + } + + /* Find the tied hash associated with the kstat entry */ + tie = get_tie(self, kp->ks_module, kp->ks_instance, + kp->ks_name, &new); + + /* If newly created store the associated kstat info */ + if (new) { + SV *kstatsv; + + /* + * Save the data necessary to read the kstat + * info on demand + */ + hv_store(tie, "class", 5, + newSVpv(kp->ks_class, 0), 0); + hv_store(tie, "crtime", 6, + NEW_HRTIME(kp->ks_crtime), 0); + kstatinfo.kstat = kp; + kstatsv = newSVpv((char *)&kstatinfo, + sizeof (kstatinfo)); + sv_magic((SV *)tie, kstatsv, '~', 0, 0); + SvREFCNT_dec(kstatsv); + + /* Save the key on the add list, if required */ + if (GIMME_V == G_ARRAY) { + av_push(add, newSVpvf("%s:%d:%s", + kp->ks_module, kp->ks_instance, + kp->ks_name)); + } + + /* If the stats already exist, just update them */ + } else { + MAGIC *mg; + KstatInfo_t *kip; + + /* Find the hidden KstatInfo_t */ + mg = mg_find((SV *)tie, '~'); + PERL_ASSERTMSG(mg != 0, "update: lost ~ magic"); + kip = (KstatInfo_t *)SvPVX(mg->mg_obj); + + /* Mark the tie as valid */ + kip->valid = TRUE; + + /* Re-save the kstat_t pointer. If the kstat + * has been deleted and re-added since the last + * update, the address of the kstat structure + * will have changed, even though the kstat will + * still live at the same place in the perl + * hash tree structure. + */ + kip->kstat = kp; + + /* Reread the stats, if read previously */ + read_kstats(tie, TRUE); + } + } + + /* + *Step 3: Delete any entries still marked as 'invalid' + */ + ret = prune_invalid(self, del); + + } + if (GIMME_V == G_ARRAY) { + EXTEND(SP, 2); + PUSHs(sv_2mortal(newRV_noinc((SV *)add))); + PUSHs(sv_2mortal(newRV_noinc((SV *)del))); + } else { + EXTEND(SP, 1); + PUSHs(sv_2mortal(newSViv(ret))); + } + + + # + # Destructor. Closes the kstat connection + # + +void +DESTROY(self) + SV *self; +PREINIT: + MAGIC *mg; + kstat_ctl_t *kc; +CODE: + mg = mg_find(SvRV(self), '~'); + PERL_ASSERTMSG(mg != 0, "DESTROY: lost ~ magic"); + kc = *(kstat_ctl_t **)SvPVX(mg->mg_obj); + if (kstat_close(kc) != 0) { + croak(DEBUG_ID ": kstat_close: failed"); + } + + # + # The following XS methods implement the TIEHASH mechanism used to update the + # kstats hash structure. These are blessed into a package that isn't + # visible to callers of the Sun::Solaris::Kstat module + # + +MODULE = Sun::Solaris::Kstat PACKAGE = Sun::Solaris::Kstat::_Stat +PROTOTYPES: ENABLE + + # + # If a value has already been read, return it. Otherwise read the appropriate + # kstat and then return the value + # + +SV* +FETCH(self, key) + SV* self; + SV* key; +PREINIT: + char *k; + STRLEN klen; + SV **value; +CODE: + self = SvRV(self); + k = SvPV(key, klen); + if (strNE(k, "class") && strNE(k, "crtime")) { + read_kstats((HV *)self, FALSE); + } + value = hv_fetch((HV *)self, k, klen, FALSE); + if (value) { + RETVAL = *value; SvREFCNT_inc(RETVAL); + } else { + RETVAL = &PL_sv_undef; + } +OUTPUT: + RETVAL + + # + # Save the passed value into the kstat hash. Read the appropriate kstat first, + # if necessary. Note that this DOES NOT update the underlying kernel kstat + # structure. + # + +SV* +STORE(self, key, value) + SV* self; + SV* key; + SV* value; +PREINIT: + char *k; + STRLEN klen; +CODE: + self = SvRV(self); + k = SvPV(key, klen); + if (strNE(k, "class") && strNE(k, "crtime")) { + read_kstats((HV *)self, FALSE); + } + SvREFCNT_inc(value); + RETVAL = *(hv_store((HV *)self, k, klen, value, 0)); + SvREFCNT_inc(RETVAL); +OUTPUT: + RETVAL + + # + # Check for the existence of the passed key. Read the kstat first if necessary + # + +bool +EXISTS(self, key) + SV* self; + SV* key; +PREINIT: + char *k; +CODE: + self = SvRV(self); + k = SvPV(key, PL_na); + if (strNE(k, "class") && strNE(k, "crtime")) { + read_kstats((HV *)self, FALSE); + } + RETVAL = hv_exists_ent((HV *)self, key, 0); +OUTPUT: + RETVAL + + + # + # Hash iterator initialisation. Read the kstats if necessary. + # + +SV* +FIRSTKEY(self) + SV* self; +PREINIT: + HE *he; +PPCODE: + self = SvRV(self); + read_kstats((HV *)self, FALSE); + hv_iterinit((HV *)self); + if (he = hv_iternext((HV *)self)) { + EXTEND(SP, 1); + PUSHs(hv_iterkeysv(he)); + } + + # + # Return hash iterator next value. Read the kstats if necessary. + # + +SV* +NEXTKEY(self, lastkey) + SV* self; + SV* lastkey; +PREINIT: + HE *he; +PPCODE: + self = SvRV(self); + if (he = hv_iternext((HV *)self)) { + EXTEND(SP, 1); + PUSHs(hv_iterkeysv(he)); + } + + + # + # Delete the specified hash entry. + # + +SV* +DELETE(self, key) + SV *self; + SV *key; +CODE: + self = SvRV(self); + RETVAL = hv_delete_ent((HV *)self, key, 0, 0); + if (RETVAL) { + SvREFCNT_inc(RETVAL); + } else { + RETVAL = &PL_sv_undef; + } +OUTPUT: + RETVAL + + # + # Clear the entire hash. This will stop any update() calls rereading this + # kstat until it is accessed again. + # + +void +CLEAR(self) + SV* self; +PREINIT: + MAGIC *mg; + KstatInfo_t *kip; +CODE: + self = SvRV(self); + hv_clear((HV *)self); + mg = mg_find(self, '~'); + PERL_ASSERTMSG(mg != 0, "CLEAR: lost ~ magic"); + kip = (KstatInfo_t *)SvPVX(mg->mg_obj); + kip->read = FALSE; + kip->valid = TRUE; + hv_store((HV *)self, "class", 5, newSVpv(kip->kstat->ks_class, 0), 0); + hv_store((HV *)self, "crtime", 6, NEW_HRTIME(kip->kstat->ks_crtime), 0); diff -r 000000000000 -r fd074940082c perl/intrd.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/perl/intrd.pl Mon Apr 19 14:25:41 2010 -0400 @@ -0,0 +1,1408 @@ +#!/usr/perl5/bin/perl +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +require 5.8.4; +use strict; +use warnings; +use POSIX; +use File::Basename("basename"); + +my $cmdname = basename($0); + +my $using_scengen = 0; # 1 if using scenario simulator +my $debug = 0; + +my $normal_sleeptime = 10; # time to sleep between samples +my $idle_sleeptime = 45; # time to sleep when idle +my $onecpu_sleeptime = (60 * 15); # used if only 1 CPU on system +my $sleeptime = $normal_sleeptime; # either normal_ or idle_ or onecpu_ + +my $idle_intrload = .1; # idle if interrupt load < 10% + +my $timerange_toohi = .01; +my $statslen = 60; # time period (in secs) to keep in @deltas + + +# Parse arguments. intrd does not accept any public arguments; the two +# arguments below are meant for testing purposes. -D generates a significant +# amount of syslog output. -S loads the filename as a perl +# script. That file is expected to implement a kstat "simulator" which +# can be used to feed information to intrd and verify intrd's responses. + +while ($_ = shift @ARGV) { + if ($_ eq "-S" && $#ARGV != -1) { + $using_scengen = 1; + do $ARGV[0]; # load simulator + shift @ARGV; + } elsif ($_ eq "-D") { + $debug = 1; + } +} + +if ($using_scengen == 0) { + require Sun::Solaris::Kstat; + require Sun::Solaris::Intrs; + import Sun::Solaris::Intrs(qw(intrmove is_pcplusmp)); + require Sys::Syslog; + import Sys::Syslog; + openlog($cmdname, 'pid', 'daemon'); + setlogmask(Sys::Syslog::LOG_UPTO($debug > 0 ? &Sys::Syslog::LOG_DEBUG : + &Sys::Syslog::LOG_INFO)); +} + +my $asserted = 0; +my $assert_level = 'debug'; # syslog level for assertion failures +sub VERIFY($@) +{ + my $bad = (shift() == 0); # $_[0] == 0 means assert failed + if ($bad) { + my $msg = shift(); + syslog($assert_level, "VERIFY: $msg", @_); + $asserted++; + } + return ($bad); +} + + + + +sub getstat($$); +sub generate_delta($$); +sub compress_deltas($); +sub dumpdelta($); + +sub goodness($); +sub imbalanced($$); +sub do_reconfig($); + +sub goodness_cpu($$); # private function +sub move_intr($$$$); # private function +sub ivecs_to_string(@); # private function +sub do_find_goal($$$$); # private function +sub find_goal($$); # private function +sub do_reconfig_cpu2cpu($$$$); # private function +sub do_reconfig_cpu($$$); # private function + + +# +# What follow are the basic data structures routines of intrd. +# +# getstat() is responsible for reading the kstats and generating a "stat" hash. +# +# generate_delta() is responsible for taking two "stat" hashes and creating +# a new "delta" hash that represents what has changed over time. +# +# compress_deltas() is responsible for taking a list of deltas and generating +# a single delta hash that encompasses all the time periods described by the +# deltas. + + +# +# getstat() is handed a reference to a kstat and generates a hash, returned +# by reference, containing all the fields from the kstats which we need. +# If it returns the scalar 0, it failed to gather the kstats, and the caller +# should react accordingly. +# +# getstat() is also responsible for maintaining a reasonable $sleeptime. +# +# {"snaptime"} kstat's snaptime +# {} one hash reference per online cpu +# ->{"tot"} == cpu::sys:cpu_nsec_{user + kernel + idle} +# ->{"crtime"} == cpu::sys:crtime +# ->{"ivecs"} +# ->{} iterates over pci_intrs:::cookie +# ->{"time"} == pci_intrs:::time (in nsec) +# ->{"pil"} == pci_intrs:::pil +# ->{"crtime"} == pci_intrs:::crtime +# ->{"ino"} == pci_intrs:::ino +# ->{"num_ino"} == num inos of single device instance sharing this entry +# Will be > 1 on pcplusmp X86 systems for devices +# with multiple MSI interrupts. +# ->{"buspath"} == pci_intrs:::buspath +# ->{"name"} == pci_intrs:::name +# ->{"ihs"} == pci_intrs:::ihs +# + +sub getstat($$) +{ + my ($ks, $pcplusmp_sys) = @_; + + my $cpucnt = 0; + my %stat = (); + my ($minsnap, $maxsnap); + + # Hash of hash which matches (MSI device, ino) combos to kstats. + my %msidevs = (); + + # kstats are not generated atomically. Each kstat hierarchy will + # have been generated within the kernel at a different time. On a + # thrashing system, we may not run quickly enough in order to get + # coherent kstat timing information across all the kstats. To + # determine if this is occurring, $minsnap/$maxsnap are used to + # find the breadth between the first and last snaptime of all the + # kstats we access. $maxsnap - $minsnap roughly represents the + # total time taken up in getstat(). If this time approaches the + # time between snapshots, our results may not be useful. + + $minsnap = -1; # snaptime is always a positive number + $maxsnap = $minsnap; + + # Iterate over the cpus in cpu:::. Check + # cpu_info::cpu_info:state to make sure the + # processor is "on-line". If not, it isn't accepting interrupts + # and doesn't concern us. + # + # Record cpu::sys:snaptime, and check $minsnap/$maxsnap. + + while (my ($cpu, $cpst) = each %{$ks->{cpu}}) { + next if !exists($ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state}); + #"state" fld of kstat w/ + # modname inst name-"cpuinfo0" + my $state = $ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state}; + next if ($state !~ /^on-line\0/); + my $cpu_sys = $cpst->{sys}; + + $stat{$cpu}{tot} = ($cpu_sys->{cpu_nsec_idle} + + $cpu_sys->{cpu_nsec_user} + + $cpu_sys->{cpu_nsec_kernel}); + $stat{$cpu}{crtime} = $cpu_sys->{crtime}; + $stat{$cpu}{ivecs} = {}; + + if ($minsnap == -1 || $cpu_sys->{snaptime} < $minsnap) { + $minsnap = $cpu_sys->{snaptime}; + } + if ($cpu_sys->{snaptime} > $maxsnap) { + $maxsnap = $cpu_sys->{snaptime}; + } + $cpucnt++; + } + + if ($cpucnt <= 1) { + $sleeptime = $onecpu_sleeptime; + return (0); # nothing to do with 1 CPU + } + + # Iterate over the ivecs. If the cpu is not on-line, ignore the + # ivecs mapped to it, if any. + # + # Record pci_intrs:{inum}::time, snaptime, crtime, pil, + # ino, name, and buspath. Check $minsnap/$maxsnap. + + foreach my $inst (values(%{$ks->{pci_intrs}})) { + my $intrcfg = (values(%$inst))[0]; + my $cpu = $intrcfg->{cpu}; + + next unless exists $stat{$cpu}; + next if ($intrcfg->{type} =~ /^disabled\0/); + + # Perl looks beyond NULL chars in pattern matching. + # Truncate name field at the first NULL + $intrcfg->{name} =~ s/\0.*$//; + + if ($intrcfg->{snaptime} < $minsnap) { + $minsnap = $intrcfg->{snaptime}; + } elsif ($intrcfg->{snaptime} > $maxsnap) { + $maxsnap = $intrcfg->{snaptime}; + } + + my $cookie = "$intrcfg->{buspath} $intrcfg->{ino}"; + if (exists $stat{$cpu}{ivecs}{$cookie}) { + my $cookiestats = $stat{$cpu}{ivecs}{$cookie}; + + $cookiestats->{time} += $intrcfg->{time}; + $cookiestats->{name} .= "/$intrcfg->{name}"; + + # If this new interrupt sharing $cookie represents a + # change from an earlier getstat, make sure that + # generate_delta will see the change by setting + # crtime to the most recent crtime of its components. + + if ($intrcfg->{crtime} > $cookiestats->{crtime}) { + $cookiestats->{crtime} = $intrcfg->{crtime}; + } + $cookiestats->{ihs}++; + next; + } + $stat{$cpu}{ivecs}{$cookie}{time} = $intrcfg->{time}; + $stat{$cpu}{ivecs}{$cookie}{crtime} = $intrcfg->{crtime}; + $stat{$cpu}{ivecs}{$cookie}{pil} = $intrcfg->{pil}; + $stat{$cpu}{ivecs}{$cookie}{ino} = $intrcfg->{ino}; + $stat{$cpu}{ivecs}{$cookie}{num_ino} = 1; + $stat{$cpu}{ivecs}{$cookie}{buspath} = $intrcfg->{buspath}; + $stat{$cpu}{ivecs}{$cookie}{name} = $intrcfg->{name}; + $stat{$cpu}{ivecs}{$cookie}{ihs} = 1; + + if ($pcplusmp_sys && ($intrcfg->{type} =~ /^msi\0/)) { + if (!(exists($msidevs{$intrcfg->{name}}))) { + $msidevs{$intrcfg->{name}} = {}; + } + $msidevs{$intrcfg->{name}}{$intrcfg->{ino}} = + \$stat{$cpu}{ivecs}{$cookie}; + } + } + + # All MSI interrupts of a device instance share a single MSI address. + # On X86 systems with an APIC, this MSI address is interpreted as CPU + # routing info by the APIC. For this reason, on these platforms, all + # interrupts for MSI devices must be moved to the same CPU at the same + # time. + # + # Since all interrupts will be on the same CPU on these platforms, all + # interrupts can be consolidated into one ivec entry. For such devices, + # num_ino will be > 1 to denote that a group move is needed. + + # Loop thru all MSI devices on X86 pcplusmp systems. + # Nop on other systems. + foreach my $msidevkey (sort keys %msidevs) { + + # Loop thru inos of the device, sorted by lowest value first + # For each cookie found for a device, incr num_ino for the + # lowest cookie and remove other cookies. + + # Assumes PIL is the same for first and current cookies + + my $first_ino = -1; + my $first_cookiep; + my $curr_cookiep; + foreach my $inokey (sort keys %{$msidevs{$msidevkey}}) { + $curr_cookiep = $msidevs{$msidevkey}{$inokey}; + if ($first_ino == -1) { + $first_ino = $inokey; + $first_cookiep = $curr_cookiep; + } else { + $$first_cookiep->{num_ino}++; + $$first_cookiep->{time} += + $$curr_cookiep->{time}; + if ($$curr_cookiep->{crtime} > + $$first_cookiep->{crtime}) { + $$first_cookiep->{crtime} = + $$curr_cookiep->{crtime}; + } + # Invalidate this cookie, less complicated and + # more efficient than deleting it. + $$curr_cookiep->{num_ino} = 0; + } + } + } + + # We define the timerange as the amount of time spent gathering the + # various kstats, divided by our sleeptime. If we take a lot of time + # to access the kstats, and then we create a delta comparing these + # kstats with a prior set of kstats, that delta will cover + # substaintially different amount of time depending upon which + # interrupt or CPU is being examined. + # + # By checking the timerange here, we guarantee that any deltas + # created from these kstats will contain self-consistent data, + # in that all CPUs and interrupts cover a similar span of time. + # + # $timerange_toohi is the upper bound. Any timerange above + # this is thrown out as garbage. If the stat is safely within this + # bound, we treat the stat as representing an instant in time, rather + # than the time range it actually spans. We arbitrarily choose minsnap + # as the snaptime of the stat. + + $stat{snaptime} = $minsnap; + my $timerange = ($maxsnap - $minsnap) / $sleeptime; + return (0) if ($timerange > $timerange_toohi); # i.e. failure + return (\%stat); +} + +# +# dumpdelta takes a reference to our "delta" structure: +# {"missing"} "1" if the delta's component stats had inconsistencies +# {"minsnap"} time of the first kstat snaptime used in this delta +# {"maxsnap"} time of the last kstat snaptime used in this delta +# {"goodness"} cost function applied to this delta +# {"avgintrload"} avg of interrupt load across cpus, as a percentage +# {"avgintrnsec"} avg number of nsec spent in interrupts, per cpu +# {} iterates over on-line cpus +# ->{"intrs"} cpu's movable intr time (sum of "time" for each ivec) +# ->{"tot"} CPU load from all sources in nsec +# ->{"bigintr"} largest value of {ivecs}{}{time} from below +# ->{"intrload"} intrs / tot +# ->{"ivecs"} +# ->{} iterates over ivecs for this cpu +# ->{"time"} time used by this interrupt (in nsec) +# ->{"pil"} pil level of this interrupt +# ->{"ino"} interrupt number (or base vector if MSI group) +# ->{"buspath"} filename of the directory of the device's bus +# ->{"name"} device name +# ->{"ihs"} number of different handlers sharing this ino +# ->{"num_ino"} number of interrupt vectors in MSI group +# +# It prints out the delta structure in a nice, human readable display. +# + +sub dumpdelta($) +{ + my ($delta) = @_; + + # print global info + + syslog('debug', "dumpdelta:"); + syslog('debug', " RECONFIGURATION IN DELTA") if $delta->{missing} > 0; + syslog('debug', " avgintrload: %5.2f%% avgintrnsec: %d", + $delta->{avgintrload} * 100, $delta->{avgintrnsec}); + syslog('debug', " goodness: %5.2f%%", $delta->{goodness} * 100) + if exists($delta->{goodness}); + + # iterate over cpus + + while (my ($cpu, $cpst) = each %$delta) { + next if !ref($cpst); # skip non-cpuid entries + my $tot = $cpst->{tot}; + syslog('debug', " cpu %3d intr %7.3f%% (bigintr %7.3f%%)", + $cpu, $cpst->{intrload}*100, $cpst->{bigintr}*100/$tot); + syslog('debug', " intrs %d, bigintr %d", + $cpst->{intrs}, $cpst->{bigintr}); + + # iterate over ivecs on this cpu + + while (my ($ivec, $ivst) = each %{$cpst->{ivecs}}) { + syslog('debug', " %15s:\"%s\": %7.3f%% %d", + ($ivst->{ihs} > 1 ? "$ivst->{name}($ivst->{ihs})" : + $ivst->{name}), $ivec, + $ivst->{time}*100 / $tot, $ivst->{time}); + } + } +} + +# +# generate_delta($stat, $newstat) takes two stat references, returned from +# getstat(), and creates a %delta. %delta (not surprisingly) contains the +# same basic info as stat and newstat, but with the timestamps as deltas +# instead of absolute times. We return a reference to the delta. +# + +sub generate_delta($$) +{ + my ($stat, $newstat) = @_; + + my %delta = (); + my $intrload; + my $intrnsec; + my $cpus; + + # Take the worstcase timerange + $delta{minsnap} = $stat->{snaptime}; + $delta{maxsnap} = $newstat->{snaptime}; + if (VERIFY($delta{maxsnap} > $delta{minsnap}, + "generate_delta: stats aren't ascending")) { + $delta{missing} = 1; + return (\%delta); + } + + # if there are a different number of cpus in the stats, set missing + + $delta{missing} = (keys(%$stat) != keys(%$newstat)); + if (VERIFY($delta{missing} == 0, + "generate_delta: number of CPUs changed")) { + return (\%delta); + } + + # scan through every cpu in %newstat and compare against %stat + + while (my ($cpu, $newcpst) = each %$newstat) { + next if !ref($newcpst); # skip non-cpuid fields + + # If %stat is missing a cpu from %newstat, then it was just + # onlined. Mark missing. + + if (VERIFY(exists $stat->{$cpu} && + $stat->{$cpu}{crtime} == $newcpst->{crtime}, + "generate_delta: cpu $cpu changed")) { + $delta{missing} = 1; + return (\%delta); + } + my $cpst = $stat->{$cpu}; + $delta{$cpu}{tot} = $newcpst->{tot} - $cpst->{tot}; + if (VERIFY($delta{$cpu}{tot} >= 0, + "generate_delta: deltas are not ascending?")) { + $delta{missing} = 1; + delete($delta{$cpu}); + return (\%delta); + } + # Avoid remote chance of division by zero + $delta{$cpu}{tot} = 1 if $delta{$cpu}{tot} == 0; + $delta{$cpu}{intrs} = 0; + $delta{$cpu}{bigintr} = 0; + + my %ivecs = (); + $delta{$cpu}{ivecs} = \%ivecs; + + # if the number of ivecs differs, set missing + + if (VERIFY(keys(%{$cpst->{ivecs}}) == + keys(%{$newcpst->{ivecs}}), + "generate_delta: cpu $cpu has more/less". + " interrupts")) { + $delta{missing} = 1; + return (\%delta); + } + + while (my ($inum, $newivec) = each %{$newcpst->{ivecs}}) { + + # Unused cookie, corresponding to an MSI vector which + # is part of a group. The whole group is accounted for + # by a different cookie. + next if ($newivec->{num_ino} == 0); + + # If this ivec doesn't exist in $stat, or if $stat + # shows a different crtime, set missing. + if (VERIFY(exists $cpst->{ivecs}{$inum} && + $cpst->{ivecs}{$inum}{crtime} == + $newivec->{crtime}, + "generate_delta: cpu $cpu inum $inum". + " has changed")) { + $delta{missing} = 1; + return (\%delta); + } + my $ivec = $cpst->{ivecs}{$inum}; + + # Create $delta{$cpu}{ivecs}{$inum}. + + my %dltivec = (); + $delta{$cpu}{ivecs}{$inum} = \%dltivec; + + # calculate time used by this interrupt + + my $time = $newivec->{time} - $ivec->{time}; + if (VERIFY($time >= 0, + "generate_delta: ivec went backwards?")) { + $delta{missing} = 1; + delete($delta{$cpu}{ivecs}{$inum}); + return (\%delta); + } + $delta{$cpu}{intrs} += $time; + $dltivec{time} = $time; + if ($time > $delta{$cpu}{bigintr}) { + $delta{$cpu}{bigintr} = $time; + } + + # Transfer over basic info about the kstat. We + # don't have to worry about discrepancies between + # ivec and newivec because we verified that both + # have the same crtime. + + $dltivec{pil} = $newivec->{pil}; + $dltivec{ino} = $newivec->{ino}; + $dltivec{buspath} = $newivec->{buspath}; + $dltivec{name} = $newivec->{name}; + $dltivec{ihs} = $newivec->{ihs}; + $dltivec{num_ino} = $newivec->{num_ino}; + } + if ($delta{$cpu}{tot} < $delta{$cpu}{intrs}) { + # Ewww! Hopefully just a rounding error. + # Make something up. + $delta{$cpu}{tot} = $delta{$cpu}{intrs}; + } + $delta{$cpu}{intrload} = + $delta{$cpu}{intrs} / $delta{$cpu}{tot}; + $intrload += $delta{$cpu}{intrload}; + $intrnsec += $delta{$cpu}{intrs}; + $cpus++; + } + if ($cpus > 0) { + $delta{avgintrload} = $intrload / $cpus; + $delta{avgintrnsec} = $intrnsec / $cpus; + } else { + $delta{avgintrload} = 0; + $delta{avgintrnsec} = 0; + } + return (\%delta); +} + + +# compress_delta takes a list of deltas, and returns a single new delta +# which represents the combined information from all the deltas. The deltas +# provided are assumed to be sequential in time. The resulting compressed +# delta looks just like any other delta. This new delta is also more accurate +# since its statistics are averaged over a longer period than any of the +# original deltas. + +sub compress_deltas ($) +{ + my ($deltas) = @_; + + my %newdelta = (); + my ($intrs, $tot); + my $cpus = 0; + my ($high_intrload) = 0; + + if (VERIFY($#$deltas != -1, + "compress_deltas: list of delta is empty?")) { + return (0); + } + $newdelta{minsnap} = $deltas->[0]{minsnap}; + $newdelta{maxsnap} = $deltas->[$#$deltas]{maxsnap}; + $newdelta{missing} = 0; + + foreach my $delta (@$deltas) { + if (VERIFY($delta->{missing} == 0, + "compressing bad deltas?")) { + return (0); + } + while (my ($cpuid, $cpu) = each %$delta) { + next if !ref($cpu); + + $intrs += $cpu->{intrs}; + $tot += $cpu->{tot}; + $newdelta{$cpuid}{intrs} += $cpu->{intrs}; + $newdelta{$cpuid}{tot} += $cpu->{tot}; + if (!exists $newdelta{$cpuid}{ivecs}) { + my %ivecs = (); + $newdelta{$cpuid}{ivecs} = \%ivecs; + } + while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) { + my $newivecs = $newdelta{$cpuid}{ivecs}; + $newivecs->{$inum}{time} += $ivec->{time}; + $newivecs->{$inum}{pil} = $ivec->{pil}; + $newivecs->{$inum}{ino} = $ivec->{ino}; + $newivecs->{$inum}{buspath} = $ivec->{buspath}; + $newivecs->{$inum}{name} = $ivec->{name}; + $newivecs->{$inum}{ihs} = $ivec->{ihs}; + $newivecs->{$inum}{num_ino} = $ivec->{num_ino}; + } + } + } + foreach my $cpu (values(%newdelta)) { + next if !ref($cpu); # ignore non-cpu fields + $cpus++; + + my $bigintr = 0; + foreach my $ivec (values(%{$cpu->{ivecs}})) { + if ($ivec->{time} > $bigintr) { + $bigintr = $ivec->{time}; + } + } + $cpu->{bigintr} = $bigintr; + $cpu->{intrload} = $cpu->{intrs} / $cpu->{tot}; + if ($high_intrload < $cpu->{intrload}) { + $high_intrload = $cpu->{intrload}; + } + $cpu->{tot} = 1 if $cpu->{tot} <= 0; + } + if ($cpus == 0) { + $newdelta{avgintrnsec} = 0; + $newdelta{avgintrload} = 0; + } else { + $newdelta{avgintrnsec} = $intrs / $cpus; + $newdelta{avgintrload} = $intrs / $tot; + } + $sleeptime = ($high_intrload < $idle_intrload) ? $idle_sleeptime : + $normal_sleeptime; + return (\%newdelta); +} + + + + + +# What follow are the core functions responsible for examining the deltas +# generated above and deciding what to do about them. +# +# goodness() and its helper goodness_cpu() return a heuristic which describe +# how good (or bad) the current interrupt balance is. The value returned will +# be between 0 and 1, with 0 representing maximum goodness, and 1 representing +# maximum badness. +# +# imbalanced() compares a current and historical value of goodness, and +# determines if there has been enough change to warrant evaluating a +# reconfiguration of the interrupts +# +# do_reconfig(), and its helpers, do_reconfig_cpu(), do_reconfig_cpu2cpu(), +# find_goal(), do_find_goal(), and move_intr(), are responsible for examining +# a delta and determining the best possible assignment of interrupts to CPUs. +# +# It is important that do_reconfig() be in alignment with goodness(). If +# do_reconfig were to generate a new interrupt distribution that worsened +# goodness, we could get into a pathological loop with intrd fighting itself, +# constantly deciding that things are imbalanced, and then changing things +# only to make them worse. + + + +# any goodness over $goodness_unsafe_load is considered really bad +# goodness must drop by at least $goodness_mindelta for a reconfig + +my $goodness_unsafe_load = .9; +my $goodness_mindelta = .1; + +# goodness(%delta) examines a delta and return its "goodness". goodness will +# be between 0 (best) and 1 (major bad). goodness is determined by evaluating +# the goodness of each individual cpu, and returning the worst case. This +# helps on systems with many CPUs, where otherwise a single pathological CPU +# might otherwise be ignored because the average was OK. +# +# To calculate the goodness of an individual CPU, we start by looking at its +# load due to interrupts. If the load is above a certain high threshold and +# there is more than one interrupt assigned to this CPU, we set goodness +# to worst-case. If the load is below the average interrupt load of all CPUs, +# then we return best-case, since what's to complain about? +# +# Otherwise we look at how much the load is above the average, and return +# that as the goodness, with one caveat: we never return more than the CPU's +# interrupt load ignoring its largest single interrupt source. This is +# because a CPU with one high-load interrupt, and no other interrupts, is +# perfectly balanced. Nothing can be done to improve the situation, and thus +# it is perfectly balanced even if the interrupt's load is 100%. + +sub goodness($) +{ + my ($delta) = @_; + + return (1) if $delta->{missing} > 0; + + my $high_goodness = 0; + my $goodness; + + foreach my $cpu (values(%$delta)) { + next if !ref($cpu); # skip non-cpuid fields + + $goodness = goodness_cpu($cpu, $delta->{avgintrload}); + if (VERIFY($goodness >= 0 && $goodness <= 1, + "goodness: cpu goodness out of range?")) { + dumpdelta($delta); + return (1); + } + if ($goodness == 1) { + return (1); # worst case, no need to continue + } + if ($goodness > $high_goodness) { + $high_goodness = $goodness; + } + } + return ($high_goodness); +} + +sub goodness_cpu($$) # private function +{ + my ($cpu, $avgintrload) = @_; + + my $goodness; + my $load = $cpu->{intrs} / $cpu->{tot}; + + return (0) if ($load < $avgintrload); # low loads are perfectly good + + # Calculate $load_no_bigintr, which represents the load + # due to interrupts, excluding the one biggest interrupt. + # This is the most gain we can get on this CPU from + # offloading interrupts. + + my $load_no_bigintr = ($cpu->{intrs} - $cpu->{bigintr}) / $cpu->{tot}; + + # A major imbalance is indicated if a CPU is saturated + # with interrupt handling, and it has more than one + # source of interrupts. Those other interrupts could be + # starved if of a lower pil. Return a goodness of 1, + # which is the worst possible return value, + # which will effectively contaminate this entire delta. + + my $cnt = keys(%{$cpu->{ivecs}}); + + if ($load > $goodness_unsafe_load && $cnt > 1) { + return (1); + } + $goodness = $load - $avgintrload; + if ($goodness > $load_no_bigintr) { + $goodness = $load_no_bigintr; + } + return ($goodness); +} + + +# imbalanced() is used by the main routine to determine if the goodness +# has shifted far enough from our last baseline to warrant a reassignment +# of interrupts. A very high goodness indicates that a CPU is way out of +# whack. If the goodness has varied too much since the baseline, then +# perhaps a reconfiguration is worth considering. + +sub imbalanced ($$) +{ + my ($goodness, $baseline) = @_; + + # Return 1 if we are pathological, or creeping away from the baseline + + return (1) if $goodness > .50; + return (1) if abs($goodness - $baseline) > $goodness_mindelta; + return (0); +} + +# do_reconfig(), do_reconfig_cpu(), and do_reconfig_cpu2cpu(), are the +# decision-making functions responsible for generating a new interrupt +# distribution. They are designed with the definition of goodness() in +# mind, i.e. they use the same definition of "good distribution" as does +# goodness(). +# +# do_reconfig() is responsible for deciding whether a redistribution is +# actually warranted. If the goodness is already pretty good, it doesn't +# waste the CPU time to generate a new distribution. If it +# calculates a new distribution and finds that it is not sufficiently +# improved from the prior distirbution, it will not do the redistribution, +# mainly to avoid the disruption to system performance caused by +# rejuggling interrupts. +# +# Its main loop works by going through a list of cpus sorted from +# highest to lowest interrupt load. It removes the highest-load cpus +# one at a time and hands them off to do_reconfig_cpu(). This function +# then re-sorts the remaining CPUs from lowest to highest interrupt load, +# and one at a time attempts to rejuggle interrupts between the original +# high-load CPU and the low-load CPU. Rejuggling on a high-load CPU is +# considered finished as soon as its interrupt load is within +# $goodness_mindelta of the average interrupt load. Such a CPU will have +# a goodness of below the $goodness_mindelta threshold. + +# +# move_intr(\%delta, $inum, $oldcpu, $newcpu) +# used by reconfiguration code to move an interrupt between cpus within +# a delta. This manipulates data structures, and does not actually move +# the interrupt on the running system. +# +sub move_intr($$$$) # private function +{ + my ($delta, $inum, $oldcpuid, $newcpuid) = @_; + + my $ivec = $delta->{$oldcpuid}{ivecs}{$inum}; + + # Remove ivec from old cpu + + my $oldcpu = $delta->{$oldcpuid}; + $oldcpu->{intrs} -= $ivec->{time}; + $oldcpu->{intrload} = $oldcpu->{intrs} / $oldcpu->{tot}; + delete($oldcpu->{ivecs}{$inum}); + + VERIFY($oldcpu->{intrs} >= 0, "move_intr: intr's time > total time?"); + VERIFY($ivec->{time} <= $oldcpu->{bigintr}, + "move_intr: intr's time > bigintr?"); + + if ($ivec->{time} >= $oldcpu->{bigintr}) { + my $bigtime = 0; + + foreach my $ivec (values(%{$oldcpu->{ivecs}})) { + $bigtime = $ivec->{time} if $ivec->{time} > $bigtime; + } + $oldcpu->{bigintr} = $bigtime; + } + + # Add ivec onto new cpu + + my $newcpu = $delta->{$newcpuid}; + + $ivec->{nowcpu} = $newcpuid; + $newcpu->{intrs} += $ivec->{time}; + $newcpu->{intrload} = $newcpu->{intrs} / $newcpu->{tot}; + $newcpu->{ivecs}{$inum} = $ivec; + + $newcpu->{bigintr} = $ivec->{time} + if $ivec->{time} > $newcpu->{bigintr}; +} + +sub move_intr_check($$$) # private function +{ + my ($delta, $oldcpuid, $newcpuid) = @_; + + VERIFY($delta->{$oldcpuid}{tot} >= $delta->{$oldcpuid}{intrs}, + "Moved interrupts left 100+%% load on src cpu"); + VERIFY($delta->{$newcpuid}{tot} >= $delta->{$newcpuid}{intrs}, + "Moved interrupts left 100+%% load on tgt cpu"); +} + +sub ivecs_to_string(@) # private function +{ + my $str = ""; + foreach my $ivec (@_) { + $str = "$str $ivec->{inum}"; + } + return ($str); +} + + +sub do_reconfig($) +{ + my ($delta) = @_; + + my $goodness = $delta->{goodness}; + + # We can't improve goodness to better than 0. We should stop here + # if, even if we achieve a goodness of 0, the improvement is still + # too small to merit the action. + + if ($goodness - 0 < $goodness_mindelta) { + syslog('debug', "goodness good enough, don't reconfig"); + return (0); + } + + syslog('notice', "Optimizing interrupt assignments"); + + if (VERIFY ($delta->{missing} == 0, "RECONFIG Aborted: should not ". + "have a delta with missing")) { + return (-1); + } + + # Make a list of all cpuids, and also add some extra information + # to the ivec structures. + + my @cpusortlist = (); + + while (my ($cpuid, $cpu) = each %$delta) { + next if !ref($cpu); # skip non-cpu entries + + push(@cpusortlist, $cpuid); + while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) { + $ivec->{origcpu} = $cpuid; + $ivec->{nowcpu} = $cpuid; + $ivec->{inum} = $inum; + } + } + + # Sort the list of CPUs from highest to lowest interrupt load. + # Remove the top CPU from that list and attempt to redistribute + # its interrupts. If the CPU has a goodness below a threshold, + # just ignore the CPU and move to the next one. If the CPU's + # load falls below the average load plus that same threshold, + # then there are no CPUs left worth reconfiguring, and we're done. + + while (@cpusortlist) { + # Re-sort cpusortlist each time, since do_reconfig_cpu can + # move interrupts around. + + @cpusortlist = + sort({$delta->{$b}{intrload} <=> $delta->{$a}{intrload}} + @cpusortlist); + + my $cpu = shift(@cpusortlist); + if (($delta->{$cpu}{intrload} <= $goodness_unsafe_load) && + ($delta->{$cpu}{intrload} <= + $delta->{avgintrload} + $goodness_mindelta)) { + syslog('debug', "finished reconfig: cpu $cpu load ". + "$delta->{$cpu}{intrload} avgload ". + "$delta->{avgintrload}"); + last; + } + if (goodness_cpu($delta->{$cpu}, $delta->{avgintrload}) < + $goodness_mindelta) { + next; + } + do_reconfig_cpu($delta, \@cpusortlist, $cpu); + } + + # How good a job did we do? If the improvement was minimal, and + # our goodness wasn't pathological (and thus needing any help it + # can get), then don't bother moving the interrupts. + + my $newgoodness = goodness($delta); + VERIFY($newgoodness <= $goodness, + "reconfig: result has worse goodness?"); + + if (($goodness != 1 || $newgoodness == 1) && + $goodness - $newgoodness < $goodness_mindelta) { + syslog('debug', "goodness already near optimum, ". + "don't reconfig"); + return (0); + } + syslog('debug', "goodness %5.2f%% --> %5.2f%%", $goodness*100, + $newgoodness*100); + + # Time to move those interrupts! + + my $ret = 1; + my $warned = 0; + while (my ($cpuid, $cpu) = each %$delta) { + next if $cpuid =~ /\D/; + while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) { + next if ($ivec->{origcpu} == $cpuid); + + if (!intrmove($ivec->{buspath}, $ivec->{ino}, + $cpuid, $ivec->{num_ino})) { + syslog('warning', "Unable to move interrupts") + if $warned++ == 0; + syslog('debug', "Unable to move buspath ". + "$ivec->{buspath} ino $ivec->{ino} to ". + "cpu $cpuid"); + $ret = -1; + } + } + } + + syslog('notice', "Interrupt assignments optimized"); + return ($ret); +} + +sub do_reconfig_cpu($$$) # private function +{ + my ($delta, $cpusortlist, $oldcpuid) = @_; + + # We have been asked to rejuggle interrupts between $oldcpuid and + # other CPUs found on $cpusortlist so as to improve the load on + # $oldcpuid. We reverse $cpusortlist to get our own copy of the + # list, sorted from lowest to highest interrupt load. One at a + # time, shift a CPU off of this list of CPUs, and attempt to + # rejuggle interrupts between the two CPUs. Don't do this if the + # other CPU has a higher load than oldcpuid. We're done rejuggling + # once $oldcpuid's goodness falls below a threshold. + + syslog('debug', "reconfiguring $oldcpuid"); + + my $cpu = $delta->{$oldcpuid}; + my $avgintrload = $delta->{avgintrload}; + + my @cputargetlist = reverse(@$cpusortlist); # make a copy of the list + while ($#cputargetlist != -1) { + last if goodness_cpu($cpu, $avgintrload) < $goodness_mindelta; + + my $tgtcpuid = shift(@cputargetlist); + my $tgt = $delta->{$tgtcpuid}; + my $load = $cpu->{intrload}; + my $tgtload = $tgt->{intrload}; + last if $tgtload > $load; + do_reconfig_cpu2cpu($delta, $oldcpuid, $tgtcpuid, $load); + } +} + +sub do_reconfig_cpu2cpu($$$$) # private function +{ + my ($delta, $srccpuid, $tgtcpuid, $srcload) = @_; + + # We've been asked to consider interrupt juggling between srccpuid + # (with a high interrupt load) and tgtcpuid (with a lower interrupt + # load). First, make a single list with all of the ivecs from both + # CPUs, and sort the list from highest to lowest load. + + syslog('debug', "exchanging intrs between $srccpuid and $tgtcpuid"); + + # Gather together all the ivecs and sort by load + + my @ivecs = (values(%{$delta->{$srccpuid}{ivecs}}), + values(%{$delta->{$tgtcpuid}{ivecs}})); + return if $#ivecs == -1; + + @ivecs = sort({$b->{time} <=> $a->{time}} @ivecs); + + # Our "goal" load for srccpuid is the average load across all CPUs. + # find_goal() will find determine the optimum selection of the + # available interrupts which comes closest to this goal without + # falling below the goal. + + my $goal = $delta->{avgintrnsec}; + + # We know that the interrupt load on tgtcpuid is less than that on + # srccpuid, but its load could still be above avgintrnsec. Don't + # choose a goal which would bring srccpuid below the load on tgtcpuid. + + my $avgnsec = + ($delta->{$srccpuid}{intrs} + $delta->{$tgtcpuid}{intrs}) / 2; + if ($goal < $avgnsec) { + $goal = $avgnsec; + } + + # If the largest of the interrupts is on srccpuid, leave it there. + # This can help minimize the disruption caused by moving interrupts. + + if ($ivecs[0]->{origcpu} == $srccpuid) { + syslog('debug', "Keeping $ivecs[0]->{inum} on $srccpuid"); + $goal -= $ivecs[0]->{time}; + shift(@ivecs); + } + + syslog('debug', "GOAL: inums should total $goal"); + find_goal(\@ivecs, $goal); + + # find_goal() returned its results to us by setting $ivec->{goal} if + # the ivec should be on srccpuid, or clearing it for tgtcpuid. + # Call move_intr() to update our $delta with the new results. + + foreach my $ivec (@ivecs) { + syslog('debug', "ivec $ivec->{inum} goal $ivec->{goal}"); + VERIFY($ivec->{nowcpu} == $srccpuid || + $ivec->{nowcpu} == $tgtcpuid, "cpu2cpu found an ". + "interrupt not currently on src or tgt cpu"); + + if ($ivec->{goal} && $ivec->{nowcpu} != $srccpuid) { + move_intr($delta, $ivec->{inum}, $ivec->{nowcpu}, + $srccpuid); + } elsif ($ivec->{goal} == 0 && $ivec->{nowcpu} != $tgtcpuid) { + move_intr($delta, $ivec->{inum}, $ivec->{nowcpu}, + $tgtcpuid); + } + } + move_intr_check($delta, $srccpuid, $tgtcpuid); # asserts + + my $newload = $delta->{$srccpuid}{intrs} / $delta->{$srccpuid}{tot}; + VERIFY($newload <= $srcload && $newload > $delta->{avgintrload}, + "cpu2cpu: new load didn't end up in expected range"); +} + + +# find_goal() and its helper do_find_goal() are used to find the best +# combination of interrupts in order to generate a load that is as close +# as possible to a goal load without falling below that goal. Before returning +# to its caller, find_goal() sets a new value in the hash of each interrupt, +# {goal}, which if set signifies that this interrupt is one of the interrupts +# identified as part of the set of interrupts which best meet the goal. +# +# The arguments to find_goal are a list of ivecs (hash references), sorted +# by descending {time}, and the goal load. The goal is relative to {time}. +# The best fit is determined by performing a depth-first search. do_find_goal +# is the recursive subroutine which carries out the search. +# +# It is passed an index as an argument, originally 0. On a given invocation, +# it is only to consider interrupts in the ivecs array starting at that index. +# It then considers two possibilities: +# 1) What is the best goal-fit if I include ivecs[index]? +# 2) What is the best goal-fit if I exclude ivecs[index]? +# To determine case 1, it subtracts the load of ivecs[index] from the goal, +# and calls itself recursively with that new goal and index++. +# To determine case 2, it calls itself recursively with the same goal and +# index++. +# +# It then compares the two results, decide which one best meets the goals, +# and returns the result. The return value is the best-fit's interrupt load, +# followed by a list of all the interrupts which make up that best-fit. +# +# As an optimization, a second array loads[] is created which mirrors ivecs[]. +# loads[i] will equal the total loads of all ivecs[i..$#ivecs]. This is used +# by do_find_goal to avoid recursing all the way to the end of the ivecs +# array if including all remaining interrupts will still leave the best-fit +# at below goal load. If so, it then includes all remaining interrupts on +# the goal list and returns. +# +sub find_goal($$) # private function +{ + my ($ivecs, $goal) = @_; + + my @goals; + my $load; + my $ivec; + + if ($goal <= 0) { + @goals = (); # the empty set will best meet the goal + } else { + syslog('debug', "finding goal from intrs %s", + ivecs_to_string(@$ivecs)); + + # Generate @loads array + + my $tot = 0; + foreach $ivec (@$ivecs) { + $tot += $ivec->{time}; + } + my @loads = (); + foreach $ivec (@$ivecs) { + push(@loads, $tot); + $tot -= $ivec->{time}; + } + ($load, @goals) = do_find_goal($ivecs, \@loads, $goal, 0); + VERIFY($load >= $goal, "find_goal didn't meet goals"); + } + syslog('debug', "goals found: %s", ivecs_to_string(@goals)); + + # Set or clear $ivec->{goal} for each ivec, based on returned @goals + + foreach $ivec (@$ivecs) { + if ($#goals > -1 && $ivec == $goals[0]) { + syslog('debug', "inum $ivec->{inum} on source cpu"); + $ivec->{goal} = 1; + shift(@goals); + } else { + syslog('debug', "inum $ivec->{inum} on target cpu"); + $ivec->{goal} = 0; + } + } +} + + +sub do_find_goal($$$$) # private function +{ + my ($ivecs, $loads, $goal, $idx) = @_; + + if ($idx > $#{$ivecs}) { + return (0); + } + syslog('debug', "$idx: finding goal $goal inum $ivecs->[$idx]{inum}"); + + my $load = $ivecs->[$idx]{time}; + my @goals_with = (); + my @goals_without = (); + my ($with, $without); + + # If we include all remaining items and we're still below goal, + # stop here. We can just return a result that includes $idx and all + # subsequent ivecs. Since this will still be below goal, there's + # nothing better to be done. + + if ($loads->[$idx] <= $goal) { + syslog('debug', + "$idx: including all remaining intrs %s with load %d", + ivecs_to_string(@$ivecs[$idx .. $#{$ivecs}]), + $loads->[$idx]); + return ($loads->[$idx], @$ivecs[$idx .. $#{$ivecs}]); + } + + # Evaluate the "with" option, i.e. the best matching goal which + # includes $ivecs->[$idx]. If idx's load is more than our goal load, + # stop here. Once we're above the goal, there is no need to consider + # further interrupts since they'll only take us further from the goal. + + if ($goal <= $load) { + $with = $load; # stop here + } else { + ($with, @goals_with) = + do_find_goal($ivecs, $loads, $goal - $load, $idx + 1); + $with += $load; + } + syslog('debug', "$idx: with-load $with intrs %s", + ivecs_to_string($ivecs->[$idx], @goals_with)); + + # Evaluate the "without" option, i.e. the best matching goal which + # excludes $ivecs->[$idx]. + + ($without, @goals_without) = + &do_find_goal($ivecs, $loads, $goal, $idx + 1); + syslog('debug', "$idx: without-load $without intrs %s", + ivecs_to_string(@goals_without)); + + # We now have our "with" and "without" options, and we choose which + # best fits the goal. If one is greater than goal and the other is + # below goal, we choose the one that is greater. If they are both + # below goal, then we choose the one that is greater. If they are + # both above goal, then we choose the smaller. + + my $which; # 0 == with, 1 == without + if ($with >= $goal && $without < $goal) { + $which = 0; + } elsif ($with < $goal && $without >= $goal) { + $which = 1; + } elsif ($with >= $goal && $without >= $goal) { + $which = ($without < $with); + } else { + $which = ($without > $with); + } + + # Return the load of our best case scenario, followed by all the ivecs + # which compose that goal. + + if ($which == 1) { # without + syslog('debug', "$idx: going without"); + return ($without, @goals_without); + } else { + syslog('debug', "$idx: going with"); + return ($with, $ivecs->[$idx], @goals_with); + } + # Not reached +} + + + + +syslog('debug', "intrd is starting".($debug ? " (debug)" : "")); + +my @deltas = (); +my $deltas_tottime = 0; # sum of maxsnap-minsnap across @deltas +my $avggoodness; +my $baseline_goodness = 0; +my $compdelta; + +my $do_reconfig; + +# temp variables +my $goodness; +my $deltatime; +my $olddelta; +my $olddeltatime; +my $delta; +my $newstat; +my $below_statslen; +my $newtime; +my $ret; + + +my $gotsig = 0; +$SIG{INT} = sub { $gotsig = 1; }; # don't die in the middle of retargeting +$SIG{HUP} = $SIG{INT}; +$SIG{TERM} = $SIG{INT}; + +my $ks; +if ($using_scengen == 0) { + $ks = Sun::Solaris::Kstat->new(); +} else { + $ks = myks_update(); # supplied by the simulator +} + +# If no pci_intrs kstats were found, we need to exit, but we can't because +# SMF will restart us and/or report an error to the administrator. But +# there's nothing an administrator can do. So print out a message for SMF +# logs and silently pause forever. + +if (!exists($ks->{pci_intrs})) { + print STDERR "$cmdname: no interrupts were found; ". + "your PCI bus may not yet be supported\n"; + pause() while $gotsig == 0; + exit 0; +} + +# See if this is a system with a pcplusmp APIC. +# Such systems will get special handling. +# Assume that if one bus has a pcplusmp APIC that they all do. + +# Get a list of pci_intrs kstats. +my @elem = values(%{$ks->{pci_intrs}}); +my $elem0 = $elem[0]; +my $elemval = (values(%$elem0))[0]; + +# Use its buspath to query the system. It is assumed that either all or none +# of the busses on a system are hosted by the pcplusmp APIC. +my $pcplusmp_sys = is_pcplusmp($elemval->{buspath}); + +my $stat = getstat($ks, $pcplusmp_sys); + +for (;;) { + sub clear_deltas { + @deltas = (); + $deltas_tottime = 0; + $stat = 0; # prevent next gen_delta() from setting {missing} + } + + # 1. Sleep, update the kstats, and save the new stats in $newstat. + + exit 0 if $gotsig; # if we got ^C / SIGTERM, exit + if ($using_scengen == 0) { + sleep($sleeptime); + exit 0 if $gotsig; # if we got ^C / SIGTERM, exit + $ks->update(); + } else { + $ks = myks_update(); + } + $newstat = getstat($ks, $pcplusmp_sys); + + # $stat or $newstat could be zero if they're uninitialized, or if + # getstat() failed. If $stat is zero, move $newstat to $stat, sleep + # and try again. If $newstat is zero, then we also sleep and try + # again, hoping the problem will clear up. + + next if (!ref $newstat); + if (!ref $stat) { + $stat = $newstat; + next; + } + + # 2. Compare $newstat with the prior set of values, result in %$delta. + + $delta = generate_delta($stat, $newstat); + dumpdelta($delta) if $debug; # Dump most recent stats to stdout. + $stat = $newstat; # The new stats now become the old stats. + + + # 3. If $delta->{missing}, then there has been a reconfiguration of + # either cpus or interrupts (probably both). We need to toss out our + # old set of statistics and start from scratch. + # + # Also, if the delta covers a very long range of time, then we've + # been experiencing a system overload that has resulted in intrd + # not being allowed to run effectively for a while now. As above, + # toss our old statistics and start from scratch. + + $deltatime = $delta->{maxsnap} - $delta->{minsnap}; + if ($delta->{missing} > 0 || $deltatime > $statslen) { + clear_deltas(); + syslog('debug', "evaluating interrupt assignments"); + next; + } + + + # 4. Incorporate new delta into the list of deltas, and associated + # statistics. If we've just now received $statslen deltas, then it's + # time to evaluate a reconfiguration. + + $below_statslen = ($deltas_tottime < $statslen); + $deltas_tottime += $deltatime; + $do_reconfig = ($below_statslen && $deltas_tottime >= $statslen); + push(@deltas, $delta); + + # 5. Remove old deltas if total time is more than $statslen. We use + # @deltas as a moving average of the last $statslen seconds. Shift + # off the olders deltas, but only if that doesn't cause us to fall + # below $statslen seconds. + + while (@deltas > 1) { + $olddelta = $deltas[0]; + $olddeltatime = $olddelta->{maxsnap} - $olddelta->{minsnap}; + $newtime = $deltas_tottime - $olddeltatime; + last if ($newtime < $statslen); + + shift(@deltas); + $deltas_tottime = $newtime; + } + + # 6. The brains of the operation are here. First, check if we're + # imbalanced, and if so set $do_reconfig. If $do_reconfig is set, + # either because of imbalance or above in step 4, we evaluate a + # new configuration. + # + # First, take @deltas and generate a single "compressed" delta + # which summarizes them all. Pass that to do_reconfig and see + # what it does with it: + # + # $ret == -1 : failure + # $ret == 0 : current config is optimal (or close enough) + # $ret == 1 : reconfiguration has occurred + # + # If $ret is -1 or 1, dump all our deltas and start from scratch. + # Step 4 above will set do_reconfig soon thereafter. + # + # If $ret is 0, then nothing has happened because we're already + # good enough. Set baseline_goodness to current goodness. + + $compdelta = compress_deltas(\@deltas); + if (VERIFY(ref($compdelta) eq "HASH", "couldn't compress deltas")) { + clear_deltas(); + next; + } + $compdelta->{goodness} = goodness($compdelta); + dumpdelta($compdelta) if $debug; + + $goodness = $compdelta->{goodness}; + syslog('debug', "GOODNESS: %5.2f%%", $goodness * 100); + + if ($deltas_tottime >= $statslen && + imbalanced($goodness, $baseline_goodness)) { + $do_reconfig = 1; + } + + if ($do_reconfig) { + $ret = do_reconfig($compdelta); + + if ($ret != 0) { + clear_deltas(); + syslog('debug', "do_reconfig FAILED!") if $ret == -1; + } else { + syslog('debug', "setting new baseline of $goodness"); + $baseline_goodness = $goodness; + } + } + syslog('debug', "---------------------------------------"); +}