/* More WIP, need to convert to uu_list. */
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <limits.h>
#include <string.h>
#include <getopt.h>
#include <libgen.h>
#include <syslog.h>
#include <kstat.h>
#include <sys/processor.h>
#include <sys/modhash.h>
#include "intrs.h"
/*
 * Interrupt vector info.
 *
 * One entry describes a single interrupt vector as read from a
 * "pci_intrs" kstat. In getstat() these entries live in a per-bus array
 * (bus_dev_t.ivecs) that is indexed by the kstat's "ino" value.
 */
typedef struct ivec {
int cookie; /* pci_intrs "cookie"; compared when searching MSI devices */
uint64_t time; /* pci_intrs "time" kstat value (in nsec) */
hrtime_t crtime; /* creation time of the pci_intrs kstat */
int pil; /* pci_intrs "pil"; not populated by the visible code */
int ino; /* pci_intrs "ino"; not populated by the visible code */
int ihs; /* pci_intrs "ihs"; not populated by the visible code */
int num_ino; /* inos of one device instance sharing this entry; set to 1 */
int origcpu; /* NOTE(review): presumably the CPU at first sight - confirm */
int nowcpu; /* NOTE(review): presumably the current CPU - confirm */
int inum; /* NOTE(review): meaning not visible in this file - confirm */
} ivec_t;
/* uu_list pool for ivec_t; not yet used by the visible code (conversion WIP) */
uu_list_pool_t *ivec_pool;
/* MSI device info */
typedef struct msi_dev {
msi_dev_t *next;
char *devpath[MAXPATHLEN];
int num_intr;
*ivec_t ivecs;
} msi_dev_t;
uu_list_pool_t *msi_dev_pool;
/*
 * Bus info.
 *
 * Node of the singly linked, per-CPU list rooted at cpu_stat_t.bus_head.
 * getstat() creates one node per distinct pci_intrs "buspath" value seen
 * for a CPU.
 */
typedef struct bus_dev {
	struct bus_dev *next;		/* next bus on this CPU, NULL at tail */
	char buspath[MAXPATHLEN];	/* pci_intrs "buspath" (NUL-terminated) */
	int num_intr;			/* entries in ivecs; set by intrinfo() */
	ivec_t *ivecs;			/* vectors on this bus, indexed by ino */
	int is_pcplusmp;		/* return value of intrinfo() */
	msi_dev_t *msi_head;		/* head of this bus' MSI device list */
} bus_dev_t;
/* uu_list pool for bus_dev_t; not yet used by the visible code (WIP) */
uu_list_pool_t *bus_dev_pool;
/*
 * Per-CPU statistics.
 *
 * getstat() keeps an array of these indexed by CPU id and zeroes it at the
 * start of every call, so a CPU that is not on-line has state == 0.
 */
typedef struct cpu_stat {
int state; /* set to P_ONLINE for CPUs whose cpu_info state is on-line */
uint64_t tot; /* sum of cpu_nsec_idle + cpu_nsec_user + cpu_nsec_kernel */
hrtime_t crtime; /* creation time of the cpu:<cpuid>:sys kstat */
bus_dev_t *bus_head; /* head of the list of buses with vectors on this CPU */
} cpu_stat_t;
/* uu_list pool for cpu_stat_t; not yet used by the visible code (WIP) */
uu_list_pool_t *cpu_stat_pool;
/* Interrupt statistics */
type def struct intr_stat {
hrtime_t snaptime;
*cpu_stat_t *cpus;
} intr_stat_t;
uu_list_pool_t *intr_stat_pool;
/*
 * Sleep intervals, in seconds, between two rounds of sampling.
 */
enum sleeptime {
	ONECPU_SLEEPTIME = 15 * 60,	/* used if only 1 CPU on system */
	IDLE_SLEEPTIME = 45,		/* time to sleep when idle */
	NORMAL_SLEEPTIME = 10		/* time to sleep between samples */
};
typedef enum sleeptime sleeptime_t;
/* Switches set while parsing the command line in main(). */
int debug;		/* -D: log up to LOG_DEBUG instead of LOG_INFO */
int foreground;		/* -f or -S: do not daemonize */
int using_scengen;	/* 1 if using scenario simulator */

/* sysconf(_SC_CPUID_MAX) + 1; sizes the per-CPU statistics array. */
int max_cpus;

/* Tunables. */
int statslen = 60;	/* time period (in secs) of deltas to keep */
float timerange_toohi = 0.1;	/* snapshots spread wider than this are dropped */
float idle_intrload = 0.1;	/* idle if interrupt load < 10% */
sleeptime_t sleeptime = NORMAL_SLEEPTIME;	/* current sampling interval */
static void load_simulator(const char *file);

/*
 * Entry point of intrd.
 *
 * Parses the (test-only) command line options, daemonizes unless asked to
 * stay in the foreground, opens syslog and the kstat chain, and allocates
 * the per-CPU statistics array.
 *
 * Returns 0 on normal termination and 1 on any start-up failure (daemon(),
 * kstat_open(), memory allocation, sysconf(), or use of the not yet
 * implemented scenario simulator).
 */
int main(int argc, char **argv)
{
	const char *cmdname;
	kstat_ctl_t *kc;
	kstat_t *ksp;
	intr_stat_t stat;
	int c;	/* int, not char: getopt() returns int and -1 at the end */

	max_cpus = (int)sysconf(_SC_CPUID_MAX) + 1;
	if (max_cpus <= 0) {
		/* sysconf() failed; there is nothing sensible to size. */
		return 1;
	}
	cmdname = basename(argv[0]);

	/*
	 * Parse arguments. intrd does not accept any public arguments; the
	 * arguments below are meant for testing purposes. -D generates a
	 * significant amount of syslog output. -f keeps intrd in the
	 * foreground. -S <filename> hands the file to load_simulator(); that
	 * file is expected to implement a kstat "simulator" which can be used
	 * to feed information to intrd and verify intrd's responses.
	 */
	while ((c = getopt(argc, argv, "S:Df")) != -1) {
		switch (c) {
		case 'S':
			using_scengen = 1;
			foreground = 1;
			load_simulator(optarg);
			break;
		case 'D':
			debug = 1;
			break;
		case 'f':
			foreground = 1;
			break;
		default:
			/* getopt() has already reported the bad option. */
			break;
		}
	}

	if (!foreground && daemon(0, 0) == -1) {
		return 1;
	}

	if (using_scengen) {
		/* scengen not implemented */
		return 1;
	}

	openlog(cmdname, LOG_PID, LOG_DAEMON);
	(void) setlogmask(LOG_UPTO(debug ? LOG_DEBUG : LOG_INFO));

	kc = kstat_open();
	if (kc == NULL) {
		return 1;
	}

	/*
	 * If no pci_intrs kstats were found, we need to exit, but we can't
	 * because SMF will restart us and/or report an error to the
	 * administrator. But there's nothing an administrator can do. So
	 * print out a message to syslog and silently pause forever.
	 */
	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
		if ((ksp->ks_type == KSTAT_TYPE_NAMED) &&
		    (strcmp(ksp->ks_module, "pci_intrs") == 0)) {
			break;
		}
	}
	if (ksp == NULL) {
		(void) kstat_close(kc);
		syslog(LOG_INFO, "no interrupts were found: "
		    "your I/O bus may not yet be supported\n");
		/* sleep() returns non-zero only when it is interrupted. */
		while (sleep(ONECPU_SLEEPTIME) == 0) {
			continue;
		}
		return 0;
	}

	/*
	 * Zero-filled so that every bus_head starts out as NULL and every
	 * CPU starts out as "not on-line".
	 */
	stat.cpus = calloc((size_t)max_cpus, sizeof (cpu_stat_t));
	if (stat.cpus == NULL) {
		(void) kstat_close(kc);
		return 1;
	}

	/* TODO: the sampling loop is not implemented yet. */

	free(stat.cpus);
	(void) kstat_close(kc);
	return 0;
}
/*
 * Soft assertion: when condition is false, log msg to syslog at LOG_DEBUG
 * with a "VERIFY: " prefix.
 *
 * Returns 1 if the condition failed and 0 if it held.
 */
static int verify(int condition, const char *msg)
{
	if (condition) {
		return 0;
	}
	syslog(LOG_DEBUG, "VERIFY: %s", msg);
	return 1;
}
/*
 * Placeholder for loading the kstat scenario simulator named by file
 * (the argument of the -S option). It currently does nothing.
 */
static void load_simulator(const char *file)
{
	(void) file;	/* not used yet */
}
static int getstat($$);
/*
int generate_delta($$);
int compress_deltas($);
int dumpdelta($);
int goodness($);
int imbalanced($$);
int do_reconfig($);
int goodness_cpu($$); # private function
int move_intr($$$$); # private function
int ivecs_to_string(@); # private function
int do_find_goal($$$$); # private function
int find_goal($$); # private function
int do_reconfig_cpu2cpu($$$$); # private function
int do_reconfig_cpu($$$);
*/
/*
*
* What follow are the basic data structures routines of intrd.
*
* getstat() is responsible for reading the kstats and generating a "stat" hash.
*
* generate_delta() is responsible for taking two "stat" hashes and creating
* a new "delta" hash that represents what has changed over time.
*
* compress_deltas() is responsible for taking a list of deltas and generating
* a single delta hash that encompasses all the time periods described by the
* deltas.
*/
/*
*
* getstat() is handed a reference to a kstat and generates a hash, returned
* by reference, containing all the fields from the kstats which we need.
* If it returns the scalar 0, it failed to gather the kstats, and the caller
* should react accordingly.
*
* getstat() is also responsible for maintaining a reasonable $sleeptime.
*
* {"snaptime"} kstat's snaptime
* {<cpuid>} one hash reference per online cpu
* ->{"tot"} == cpu:<cpuid>:sys:cpu_nsec_{user + kernel + idle}
* ->{"crtime"} == cpu:<cpuid>:sys:crtime
* ->{"ivecs"}
* ->{<cookie#>} iterates over pci_intrs::<nexus>:cookie
* ->{"time"} == pci_intrs:<ivec#>:<nexus>:time (in nsec)
* ->{"pil"} == pci_intrs:<ivec#>:<nexus>:pil
* ->{"crtime"} == pci_intrs:<ivec#>:<nexus>:crtime
* ->{"ino"} == pci_intrs:<ivec#>:<nexus>:ino
* ->{"num_ino"} == num inos of single device instance sharing this entry
* Will be > 1 on pcplusmp X86 systems for devices
* with multiple MSI interrupts.
* ->{"buspath"} == pci_intrs:<ivec#>:<nexus>:buspath
* ->{"name"} == pci_intrs:<ivec#>:<nexus>:name
* ->{"ihs"} == pci_intrs:<ivec#>:<nexus>:ihs
*
*/
int getstat(kstat_ctl_t *kc, intr_stat_t *stat)
{
int cpucnt = 0;
kstat_t *ksp;
hrtime_t minsnap, maxsnap;
/* Hash of hash which matches (MSI device, ino) combos to kstats. */
msi_dev_t *msidevs;
/*
* kstats are not generated atomically. Each kstat hierarchy will
* have been generated within the kernel at a different time. On a
* thrashing system, we may not run quickly enough in order to get
* coherent kstat timing information across all the kstats. To
* determine if this is occurring, $minsnap/$maxsnap are used to
* find the breadth between the first and last snaptime of all the
* kstats we access. $maxsnap - $minsnap roughly represents the
* total time taken up in getstat(). If this time approaches the
* time between snapshots, our results may not be useful.
*/
minsnap = -1; /* snaptime is always a positive number */
maxsnap = minsnap;
/*
* iterate over the cpus in cpu:<cpuid>::. check
* cpu_info:<cpuid>:cpu_info<cpuid>:state to make sure the
* processor is "on-line". if not, it isn't accepting interrupts
* and doesn't concern us.
*
* record cpu:<cpuid>:sys:snaptime, and check $minsnap/$maxsnap.
*/
cpu_stats = stat->cpus;
bzero(cpu_stats, sizeof (cpu_stat_t) * max_cpus);
for (ksp = kc->kc_chain; ksp != null; ksp = ksp->ks_next) {
kstat_t *ksp_sys;
kstat_named_t *knp;
int cpu;
hrtime_t snaptime;
if ((ksp->ks_type != kstat_type_named) ||
strcmp(ksp->ks_module, "cpu_info") ||
(kstat_read(kc, ksp) == -1)) {
continue;
}
knp = kstat_data_lookup(ksp, "state");
if ((knp == NULL) || strcmp(knp->name, PS_ONLINE) ||
((cpu = ksp->ks_instance) >= max_cpus)) {
continue;
}
ksp_sys = kstat_lookup(kc, "cpu", cpu, "sys");
if ((ksp_sys == NULL) || (kstat_read(kc, ksp_sys) == -1)) {
continue;
}
cpu_stats[cpu].state = P_ONLINE;
knp = ksp_sys->ks_data;
for (i = 0; i < ksp_sys->ks_ndata; i++) {
if ((strcmp(knp[i].name, "cpu_nsec_idle") == 0) ||
(strcmp(knp[i].name, "cpu_nsec_user") == 0) ||
(strcmp(knp[i].name, "cpu_nsec_kernel") == 0)) {
cpu_stats[cpu].tot += knp[i].value.ui64;
}
cpu_stats[cpu].crtime = ksp_sys->crtime;
snaptime = ksp_sys->snaptime;
if (minsnap == -1 || snaptime < minsnap) {
minsnap = snaptime;
}
if (snaptime > maxsnap) {
maxsnap = snaptime;
}
cpucnt++;
}
if (cpucnt <= 1) {
sleeptime = ONECPU_SLEEPTIME;
return 0; /* nothing to do with 1 CPU */
}
/*
* Iterate over the ivecs. If the cpu is not on-line, ignore the
* ivecs mapped to it, if any.
*
* Record pci_intrs:{inum}:<nexus>:time, snaptime, crtime, pil,
* ino, name, and buspath. Check $minsnap/$maxsnap.
*/
for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
kstat_named_t *knp;
int cpu;
int ino;
cpu_stat_t *cpup;
bus_dev_t *busp;
bus_dev_t *bus_last;
ivec_t *ivecp;
hrtime_t snaptime;
if ((ksp->ks_type != KSTAT_TYPE_NAMED) ||
strcmp(ksp->ks_module, "pci_intrs") ||
(kstat_read(kc, ksp) == -1)) {
continue;
}
knp = kstat_data_lookup(ksp, "cpu");
if ((knp == NULL) || ((cpu = knp->value.ui32) >= max_cpus) ||
(cpu_stats[cpu].state != P_ONLINE)) {
continue;
}
cpup = &cpu_stats[cpu];
knp = kstat_data_lookup(ksp, "type");
if ((knp == NULL) || strcmp(knp->value.c, "disabled")) {
continue;
}
knp = kstat_data_lookup(ksp, "buspath");
if (knp == NULL) {
continue;
}
for (bus_last = NULL, busp = cpup->bus_head; busp != NULL;
bus_last = busp, busp = busp->next) {
if (strcmp(knp->value.c, busp->buspath) == 0) {
break;
}
}
if (busp == NULL) {
busp = malloc(sizeof (bus_dev_t));
if (busp == NULL) {
return -1;
}
busp->next = NULL;
strlcpy(busp->buspath, knp->value.c, MAXPATHLEN);
busp->is_pcplusmp =
intrinfo(busp->buspath, &(busp->num_intr));
busp->ivecs = malloc(sizeof (ivec_t) * busp->num_intr);
if (busp->ivecs == NULL) {
free(busp);
return -1;
}
bzero(busp->ivecs, sizeof (ivec_t) * busp->num_intr);
if (bus_last == NULL) {
cpup->bus_head = busp;
} else {
bus_last->next = busp;
}
}
knp = kstat_data_lookup(ksp, "ino");
if ((knp == NULL) ||
((ino = knp->value.ui32) >= busp->num_intr)) {
continue;
}
ivecp = &(busp->ivecs[ino]);
knp = kstat_data_lookup(ksp, "time");
if (knp == NULL) {
continue;
}
ivecp->time = knp->value.ui64;
if (busp->is_pcplusmp) {
knp = kstat_data_lookup(ksp, "type");
if (knp == NULL) {
continue;
}
if (strcmp(knp->value.c, "msi") == 0) {
for (msi_last = NULL, msip = busp->msi_head;
msip != NULL;
msi_last = msip, msip = msip->next) {
if (ivecp->cookie == msip->cookie) {
break;
}
}
}
ivecp->num_ino = 1;
ivecp->crtime = ksp->crtime;
snaptime = ksp->snaptime;
if (minsnap == -1 || snaptime < minsnap) {
minsnap = snaptime;
}
if (snaptime > maxsnap) {
maxsnap = snaptime;
}
}
/*
* All MSI interrupts of a device instance share a single MSI address.
* On X86 systems with an APIC, this MSI address is interpreted as CPU
* routing info by the APIC. For this reason, on these platforms, all
* interrupts for MSI devices must be moved to the same CPU at the same
* time.
*
* Since all interrupts will be on the same CPU on these platforms, all
* interrupts can be consolidated into one ivec entry. For such devices,
* num_ino will be > 1 to denote that a group move is needed.
*/
/*
* Loop thru all MSI devices on X86 pcplusmp systems.
* Nop on other systems.
*/
/*
* We define the timerange as the amount of time spent gathering the
* various kstats, divided by our sleeptime. If we take a lot of time
* to access the kstats, and then we create a delta comparing these
* kstats with a prior set of kstats, that delta will cover
* substaintially different amount of time depending upon which
* interrupt or CPU is being examined.
*
* By checking the timerange here, we guarantee that any deltas
* created from these kstats will contain self-consistent data,
* in that all CPUs and interrupts cover a similar span of time.
*
* $timerange_toohi is the upper bound. Any timerange above
* this is thrown out as garbage. If the stat is safely within this
* bound, we treat the stat as representing an instant in time, rather
* than the time range it actually spans. We arbitrarily choose minsnap
* as the snaptime of the stat.
*/
stat->snaptime = minsnap;
if (((maxsnap - minsnap) / sleeptime) > timerange_toohi) {
return 0;
}
return 1;
}