サブロウ丸

Sabrou-mal サブロウ丸

主にプログラミングと数学

Summarize ibstat information of all servers in a multi-node environment

The ibstat command provides detailed information about the InfiniBand adapters attached to a server. In this post I share a script that collects the ibstat output of every server in a multi-node environment over SSH and prints a summary in CSV format.

import subprocess
import csv
import sys

# Login name placed before "@" in the SSH destination for every server.
username = "YOUR_USERNAME"

# Host names of the servers to query; main() visits them in this order.
servers = [
    "xxxxx",
    "yyyyy",
    "zzzzz",
    "your_server_names",
]


def exec_command(commands):
    """Run a command without a shell and return its standard output lines.

    Args:
        commands: Sequence of program arguments (program name first).

    Returns:
        The standard output, stripped of surrounding whitespace and split
        into lines. An empty list is returned when nothing was printed, so
        callers never see a phantom blank entry.
    """
    completed = subprocess.run(
        commands,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    if completed.returncode != 0:
        # Report the failure instead of discarding it, but keep going so one
        # unreachable host does not abort a scan over many servers.
        details = completed.stderr.decode(errors="replace").strip()
        print(
            f"warning: {' '.join(map(str, commands))} "
            f"exited with status {completed.returncode}: {details}",
            file=sys.stderr,
        )

    # splitlines() yields [] for empty text, unlike "".split("\n") == [""].
    return completed.stdout.decode().strip().splitlines()


def get_devices(server):
    """Return the names printed by ``ibstat -l`` on *server*.

    Args:
        server: Host name to connect to over SSH as ``username``.

    Returns:
        A list of non-empty device names, one per output line.
    """
    output = exec_command(("ssh", f"{username}@{server}", "'ibstat'", "-l"))
    # Drop blank entries so an empty or failed listing yields no devices
    # rather than a single device named "".
    return [name.strip() for name in output if name.strip()]


def _split_field(line):
    """Split a "key: value" line into a (key, value) pair, or return None.

    The key has surrounding whitespace removed and inner spaces replaced by
    underscores. Only the first colon separates key from value, so values
    that contain colons are kept whole.
    """
    key, separator, value = line.partition(":")
    if not separator:
        return None
    return key.strip().replace(" ", "_"), value.strip()


def get_device_data(server, device):
    """Run ``ibstat <device>`` on *server* and parse the output into a dict.

    Args:
        server: Host name to connect to over SSH as ``username``.
        device: Device name passed to ``ibstat``.

    Returns:
        A dict with "server" and "device" entries, one entry per
        single-tab-indented "key: value" line, and a "port" dict mapping
        each port number (as a string) to the fields listed beneath it.
        The "port" entry is absent when the output has no port section.

    Raises:
        ValueError: If a port-level field appears before any port header.
    """
    # get standard output of ibstat
    output = exec_command(("ssh", f"{username}@{server}", "'ibstat'", f"{device}"))

    data = {"server": server, "device": device}
    current_port = None

    for line in output:
        if line.startswith("\tPort"):
            # Header such as "\tPort 1:" opens a new per-port section.
            current_port = line.strip().split(":")[0].split()[1]
            data.setdefault("port", {})[current_port] = {}
            continue

        if not line.startswith("\t"):
            # Unindented lines carry no key/value field.
            continue

        field = _split_field(line)
        if field is None:
            # Indented line without a colon: nothing to record.
            continue
        key, value = field

        if line.startswith("\t\t"):
            # Two tabs: the field belongs to the most recent port header.
            if current_port is None:
                raise ValueError(
                    f"port field found before any port header: {line!r}"
                )
            data["port"][current_port][key] = value
        else:
            # One tab: the field describes the device itself.
            data[key] = value

    return data


def get_server_data(server):
    """Return the parsed ``ibstat`` data of every device found on *server*.

    Args:
        server: Host name to query.

    Returns:
        A list with one dict per device, in the order reported by
        ``get_devices``.
    """
    devices = get_devices(server)
    # The original called an undefined name here; get_device_data is the
    # function in this file that runs ibstat for one device and parses it.
    return [get_device_data(server, device) for device in devices]


def write_data(row, writer):
    """Write one CSV line describing port "1" of a parsed device record.

    Args:
        row: Device dict holding "server", "device" and a "port" mapping.
        writer: Object with a ``writerow`` method, e.g. a ``csv.writer``.
    """
    # Only the first port is reported; every CA is assumed to have "Port 1".
    first_port = row["port"]["1"]
    port_columns = ("State", "Base_lid", "SM_lid", "Port_GUID")

    record = [row["server"], row["device"]]
    for column in port_columns:
        record.append(first_port[column])

    writer.writerow(record)


def main():
    """Print a CSV summary of the ibstat data of all servers to stdout.

    Writes a header line, then one line per device of each entry in
    ``servers``.
    """
    writer = csv.writer(sys.stdout)
    writer.writerow(["server", "device", "State", "Base_lid", "SM_lid", "Port_GUID"])

    for server in servers:
        # get_server_data yields one parsed record per device; calling
        # get_device_data here would fail because it also needs a device.
        for device_data in get_server_data(server):
            write_data(device_data, writer)


# Produce the report only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()