# HG changeset patch
# User Albert Lee
# Date 1271702826 14400
# Node ID 380ada8fd6211400c31816692b3983ac68b10807
# Parent ee32231c211b1cef049ca55dc5ec53bd3d8d8821
Start of intrd implementation.

diff -r ee32231c211b -r 380ada8fd621 intrd.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/intrd.c	Mon Apr 19 14:47:06 2010 -0400
@@ -0,0 +1,502 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/processor.h>
+#include <kstat.h>
+#include <libgen.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <syslog.h>
+#include <unistd.h>
+
+#include "intrs.h"
+
+/* kstat timestamps are hrtime_t nanoseconds; snaptimes are kept in seconds. */
+#define	INTRD_NANOSEC	1000000000.0
+
+/* One interrupt vector. */
+typedef struct ivec {
+	int cookie;
+	hrtime_t time;
+	hrtime_t crtime;
+	int pil;
+	int ino;
+	int ihs;
+	int num_ino;
+	int origcpu;
+	int nowcpu;
+	int inum;
+} ivec_t;
+
+/* The interrupt vectors recorded for one bus path. */
+typedef struct bus_stat {
+	struct bus_stat *next;	/* the bus_stat_t name does not exist yet */
+	char buspath[MAXPATHLEN];
+	int num_intr;
+	ivec_t *ivecs;
+} bus_stat_t;
+
+/* Per-CPU sample; getstat() indexes the array of these by CPU id. */
+typedef struct cpu_stat {
+	int state;		/* P_ONLINE once getstat() has seen it on-line */
+	uint64_t tot;		/* cpu_nsec_{user + kernel + idle} */
+	hrtime_t crtime;
+	bus_stat_t *bus_stats;
+} cpu_stat_t;
+
+/* One snapshot, filled in by getstat(). */
+typedef struct intr_stat {
+	double snaptime;	/* seconds */
+	cpu_stat_t *cpus;	/* max_cpus entries */
+} intr_stat_t;
+
+typedef enum sleeptime {
+	NORMAL_SLEEPTIME = 10,		/* time to sleep between samples */
+	IDLE_SLEEPTIME = 45,		/* time to sleep when idle */
+	ONECPU_SLEEPTIME = 60 * 15,	/* used if only 1 CPU on system */
+} sleeptime_t;
+
+int using_scengen;	/* 1 if using scenario simulator */
+int debug;
+int foreground;
+
+int max_cpus;
+
+sleeptime_t sleeptime = NORMAL_SLEEPTIME; /* either normal_ or idle_ or onecpu_ */
+
+float idle_intrload = 0.1;	/* idle if interrupt load < 10% */
+
+float timerange_toohi = 0.1;
+int statslen = 60;	/* time period (in secs) to keep in @deltas */
+
+static int verify(int condition, const char *msg);
+static void load_simulator(const char *file);
+static int getstat(kstat_ctl_t *kc, intr_stat_t *stat);
+
+/*
+ * Parse the options, daemonize unless told otherwise, open the kstat chain
+ * and allocate the per-CPU sample array.  The sampling loop itself is not
+ * written yet.  Returns 0 on success, 1 if any setup step fails.
+ */
+int main(int argc, char **argv)
+{
+	const char *cmdname;
+	kstat_ctl_t *kc;
+	kstat_t *ksp;
+	intr_stat_t stat;
+	long cpuid_max;
+
+	/* sysconf() reports failure as -1; never size the array from that. */
+	cpuid_max = sysconf(_SC_CPUID_MAX);
+	if (cpuid_max < 0) {
+		return 1;
+	}
+	max_cpus = (int)cpuid_max + 1;
+
+	cmdname = basename(argv[0]);
+/*
+ * Parse arguments. intrd does not accept any public arguments; the
+ * arguments below are meant for testing purposes. -D generates a significant
+ * amount of syslog output. -S <filename> loads the filename as a perl
+ * script. That file is expected to implement a kstat "simulator" which
+ * can be used to feed information to intrd and verify intrd's responses.
+ * -f keeps intrd in the foreground.
+ */
+	for (; --argc > 0; ++argv) {
+		if (argv[1][0] != '-' || argv[1][1] == '\0' ||
+		    argv[1][2] != '\0') {
+			continue;
+		}
+
+		switch (argv[1][1]) {
+		case 'S':
+			using_scengen = 1;
+			foreground = 1;
+			if (argc > 1) {
+				/* The script name is the word after -S. */
+				--argc;
+				++argv;
+				load_simulator(argv[1]);
+			}
+			break;
+		case 'D':
+			debug = 1;
+			break;
+		case 'f':
+			foreground = 1;
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (!foreground) {
+		if (daemon(0, 0) == -1) {
+			return 1;
+		}
+	}
+
+	if (using_scengen) {
+		/* scengen not implemented */
+		return 1;
+	}
+
+	openlog(cmdname, LOG_PID, LOG_DAEMON);
+	(void) setlogmask(LOG_UPTO(debug ? LOG_DEBUG : LOG_INFO));
+
+	kc = kstat_open();
+	if (kc == NULL) {
+		return 1;
+	}
+
+/*
+ * If no pci_intrs kstats were found, we need to exit, but we can't because
+ * SMF will restart us and/or report an error to the administrator. But
+ * there's nothing an administrator can do. So print out a message to syslog
+ * and silently pause forever.
+ */
+	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
+		if ((ksp->ks_type == KSTAT_TYPE_NAMED) &&
+		    !strcmp(ksp->ks_module, "pci_intrs")) {
+			break;
+		}
+	}
+	if (ksp == NULL) {
+		(void) kstat_close(kc);
+		syslog(LOG_INFO, "no interrupts were found: "
+		    "your I/O bus may not yet be supported\n");
+		for (;;) {
+			(void) sleep(ONECPU_SLEEPTIME);
+		}
+	}
+
+	stat.cpus = calloc(max_cpus, sizeof (*stat.cpus));
+	if (stat.cpus == NULL) {
+		(void) kstat_close(kc);
+		return 1;
+	}
+
+	/* Nothing consumes the samples yet; release what was set up. */
+	free(stat.cpus);
+	(void) kstat_close(kc);
+	return 0;
+}
+
+/*
+ * Log msg at debug priority when condition is false.
+ * Returns nonzero if the condition failed and 0 if it held.
+ */
+static int verify(int condition, const char *msg)
+{
+	int bad = !condition;
+
+	if (bad) {
+		syslog(LOG_DEBUG, "VERIFY: %s", msg);
+	}
+	return bad;
+}
+
+/* Load the kstat simulator script named by file; not implemented yet. */
+static void load_simulator(const char *file)
+{
+	(void) file;
+}
+
+/*
+ * Routines still to be ported, listed with their Perl prototypes:
+ *
+ * generate_delta($$)
+ * compress_deltas($)
+ * dumpdelta($)
+ *
+ * goodness($)
+ * imbalanced($$)
+ * do_reconfig($)
+ *
+ * goodness_cpu($$) - private function
+ * move_intr($$$$) - private function
+ * ivecs_to_string(@) - private function
+ * do_find_goal($$$$) - private function
+ * find_goal($$) - private function
+ * do_reconfig_cpu2cpu($$$$) - private function
+ * do_reconfig_cpu($$$)
+ */
+
+/*
+ * What follow are the basic data structure routines of intrd.
+ *
+ * getstat() is responsible for reading the kstats and generating a "stat".
+ *
+ * generate_delta() is responsible for taking two "stat"s and creating
+ * a new "delta" that represents what has changed over time.
+ *
+ * compress_deltas() is responsible for taking a list of deltas and generating
+ * a single delta that encompasses all the time periods described by the
+ * deltas.
+ */
+
+/*
+ * Fetch the integer held by a named kstat entry, whatever its width.
+ * Returns 0 with the value in *valp, or -1 if the entry is not an integer.
+ * Negative values convert to huge unsigned ones, so range checks reject them.
+ */
+static int named_value(const kstat_named_t *knp, uint64_t *valp)
+{
+	switch (knp->data_type) {
+	case KSTAT_DATA_INT32:
+		*valp = (uint64_t)knp->value.i32;
+		return 0;
+	case KSTAT_DATA_UINT32:
+		*valp = knp->value.ui32;
+		return 0;
+	case KSTAT_DATA_INT64:
+		*valp = (uint64_t)knp->value.i64;
+		return 0;
+	case KSTAT_DATA_UINT64:
+		*valp = knp->value.ui64;
+		return 0;
+	default:
+		return -1;
+	}
+}
+
+/* Return the string held by a named kstat entry, or NULL if it holds none. */
+static const char *named_string(const kstat_named_t *knp)
+{
+	switch (knp->data_type) {
+	case KSTAT_DATA_CHAR:
+		return knp->value.c;
+	case KSTAT_DATA_STRING:
+		return KSTAT_NAMED_STR_PTR(knp);
+	default:
+		return NULL;
+	}
+}
+
+/* Widen the [*minp, *maxp] window to include snaptime; -1 means "unset". */
+static void note_snaptime(double snaptime, double *minp, double *maxp)
+{
+	if (*minp == -1 || snaptime < *minp) {
+		*minp = snaptime;
+	}
+	if (snaptime > *maxp) {
+		*maxp = snaptime;
+	}
+}
+
+/*
+ * getstat() is handed an open kstat chain and fills in *stat with all the
+ * fields from the kstats which we need.
+ * If it returns 0, it failed to gather usable kstats, and the caller
+ * should react accordingly; it returns 1 on success.
+ *
+ * getstat() is also responsible for maintaining a reasonable sleeptime.
+ *
+ * The "stat" layout being ported (the ivecs are not collected yet):
+ *
+ * {"snaptime"}		kstat's snaptime
+ * {<cpuid>}		one entry per online cpu
+ *  ->{"tot"}		== cpu:<cpuid>:sys:cpu_nsec_{user + kernel + idle}
+ *  ->{"crtime"}	== cpu:<cpuid>:sys:crtime
+ *  ->{"ivecs"}
+ *     ->{<cookie#>}	iterates over pci_intrs::<nexus>:cookie
+ *        ->{"time"}	== pci_intrs:<ivec#>:<nexus>:time (in nsec)
+ *        ->{"pil"}	== pci_intrs:<ivec#>:<nexus>:pil
+ *        ->{"crtime"}	== pci_intrs:<ivec#>:<nexus>:crtime
+ *        ->{"ino"}	== pci_intrs:<ivec#>:<nexus>:ino
+ *        ->{"num_ino"}	== num inos of single device instance sharing this
+ *			entry. Will be > 1 on pcplusmp X86 systems for
+ *			devices with multiple MSI interrupts.
+ *        ->{"buspath"}	== pci_intrs:<ivec#>:<nexus>:buspath
+ *        ->{"name"}	== pci_intrs:<ivec#>:<nexus>:name
+ *        ->{"ihs"}	== pci_intrs:<ivec#>:<nexus>:ihs
+ */
+static int getstat(kstat_ctl_t *kc, intr_stat_t *stat)
+{
+	int cpucnt = 0;
+	kstat_t *ksp;
+	cpu_stat_t *cpu_stats;
+	double minsnap, maxsnap;
+
+	/*
+	 * kstats are not generated atomically. Each kstat hierarchy will
+	 * have been generated within the kernel at a different time. On a
+	 * thrashing system, we may not run quickly enough in order to get
+	 * coherent kstat timing information across all the kstats. To
+	 * determine if this is occurring, minsnap/maxsnap are used to
+	 * find the breadth between the first and last snaptime of all the
+	 * kstats we access. maxsnap - minsnap roughly represents the
+	 * total time taken up in getstat(). If this time approaches the
+	 * time between snapshots, our results may not be useful.
+	 */
+
+	minsnap = -1;	/* snaptime is always a positive number */
+	maxsnap = minsnap;
+
+	/*
+	 * iterate over the cpus in cpu:<cpuid>::. check
+	 * cpu_info:<cpuid>:cpu_info<cpuid>:state to make sure the
+	 * processor is "on-line". if not, it isn't accepting interrupts
+	 * and doesn't concern us.
+	 *
+	 * record cpu:<cpuid>:sys:snaptime, and check minsnap/maxsnap.
+	 */
+
+	cpu_stats = stat->cpus;
+	bzero(cpu_stats, sizeof (cpu_stat_t) * max_cpus);
+
+	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
+		kstat_t *ksp_sys;
+		kstat_named_t *knp;
+		const char *state;
+		int cpu;
+		uint_t i;
+
+		if ((ksp->ks_type != KSTAT_TYPE_NAMED) ||
+		    strcmp(ksp->ks_module, "cpu_info") ||
+		    (kstat_read(kc, ksp, NULL) == -1)) {
+			continue;
+		}
+		/* The on-line marker is the value of "state", not its name. */
+		knp = kstat_data_lookup(ksp, "state");
+		if ((knp == NULL) || ((state = named_string(knp)) == NULL) ||
+		    strcmp(state, PS_ONLINE)) {
+			continue;
+		}
+		cpu = ksp->ks_instance;
+		if ((cpu < 0) || (cpu >= max_cpus)) {
+			continue;
+		}
+		ksp_sys = kstat_lookup(kc, "cpu", cpu, "sys");
+		if ((ksp_sys == NULL) ||
+		    (kstat_read(kc, ksp_sys, NULL) == -1)) {
+			continue;
+		}
+		cpu_stats[cpu].state = P_ONLINE;
+		knp = ksp_sys->ks_data;
+		for (i = 0; i < ksp_sys->ks_ndata; i++) {
+			if (!strcmp(knp[i].name, "cpu_nsec_idle") ||
+			    !strcmp(knp[i].name, "cpu_nsec_user") ||
+			    !strcmp(knp[i].name, "cpu_nsec_kernel")) {
+				cpu_stats[cpu].tot += knp[i].value.ui64;
+			}
+		}
+		cpu_stats[cpu].crtime = ksp_sys->ks_crtime;
+		note_snaptime(ksp_sys->ks_snaptime / INTRD_NANOSEC,
+		    &minsnap, &maxsnap);
+		cpucnt++;
+	}
+
+	if (cpucnt <= 1) {
+		sleeptime = ONECPU_SLEEPTIME;
+		return 0;	/* nothing to do with 1 CPU */
+	}
+
+	/*
+	 * Iterate over the ivecs. If the cpu is not on-line, ignore the
+	 * ivecs mapped to it, if any.
+	 *
+	 * Record pci_intrs:{inum}:<nexus>:time, snaptime, crtime, pil,
+	 * ino, name, and buspath. Check minsnap/maxsnap.
+	 * (Only the snaptime is recorded so far.)
+	 */
+
+	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
+		kstat_named_t *knp;
+		const char *type;
+		uint64_t cpu;
+
+		if ((ksp->ks_type != KSTAT_TYPE_NAMED) ||
+		    strcmp(ksp->ks_module, "pci_intrs") ||
+		    (kstat_read(kc, ksp, NULL) == -1)) {
+			continue;
+		}
+		/* cpu indexes cpu_stats[], so it must stay below max_cpus. */
+		knp = kstat_data_lookup(ksp, "cpu");
+		if ((knp == NULL) || (named_value(knp, &cpu) == -1) ||
+		    (cpu >= (uint64_t)max_cpus) ||
+		    (cpu_stats[cpu].state != P_ONLINE)) {
+			continue;
+		}
+		/* Disabled interrupts are the ones to skip. */
+		knp = kstat_data_lookup(ksp, "type");
+		if ((knp == NULL) || ((type = named_string(knp)) == NULL) ||
+		    !strcmp(type, "disabled")) {
+			continue;
+		}
+		knp = kstat_data_lookup(ksp, "buspath");
+		if (knp == NULL) {
+			continue;
+		}
+		note_snaptime(ksp->ks_snaptime / INTRD_NANOSEC,
+		    &minsnap, &maxsnap);
+	}
+
+	/*
+	 * All MSI interrupts of a device instance share a single MSI address.
+	 * On X86 systems with an APIC, this MSI address is interpreted as CPU
+	 * routing info by the APIC. For this reason, on these platforms, all
+	 * interrupts for MSI devices must be moved to the same CPU at the same
+	 * time.
+	 *
+	 * Since all interrupts will be on the same CPU on these platforms, all
+	 * interrupts can be consolidated into one ivec entry. For such devices,
+	 * num_ino will be > 1 to denote that a group move is needed.
+	 */
+
+	/*
+	 * Loop thru all MSI devices on X86 pcplusmp systems.
+	 * Nop on other systems.
+	 */
+
+	/*
+	 * We define the timerange as the amount of time spent gathering the
+	 * various kstats, divided by our sleeptime. If we take a lot of time
+	 * to access the kstats, and then we create a delta comparing these
+	 * kstats with a prior set of kstats, that delta will cover
+	 * substantially different amounts of time depending upon which
+	 * interrupt or CPU is being examined.
+	 *
+	 * By checking the timerange here, we guarantee that any deltas
+	 * created from these kstats will contain self-consistent data,
+	 * in that all CPUs and interrupts cover a similar span of time.
+	 *
+	 * timerange_toohi is the upper bound. Any timerange above
+	 * this is thrown out as garbage. If the stat is safely within this
+	 * bound, we treat the stat as representing an instant in time, rather
+	 * than the time range it actually spans. We arbitrarily choose minsnap
+	 * as the snaptime of the stat.
+	 */
+
+	stat->snaptime = minsnap;
+	if (((maxsnap - minsnap) / sleeptime) > timerange_toohi) {
+		return 0;
+	}
+	return 1;
+}
+