#!/usr/bin/env python

# SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

import nsysstats

class CUDAKernelExecSummary(nsysstats.StatsReport):
    """Deprecated report summarizing CUDA kernel launch and execution times.

    The display name directs users to ``cuda_kern_exec_sum`` instead.

    The query pairs every row of ``CUPTI_ACTIVITY_KIND_KERNEL`` with the
    ``CUPTI_ACTIVITY_KIND_RUNTIME`` row that has the same correlation ID and
    belongs to the same process, then aggregates four durations per
    (PID, TID, device, API name, kernel name) group:

    * ``T*`` columns -- start of the API call to the later of API end and
      kernel end (``totalDur``).
    * ``A*`` columns -- duration of the API call (``ApiDur``).
    * ``Q*`` columns -- gap between API end and kernel start (``QueDur``);
      NULL, and therefore not aggregated, when the kernel started before
      the API call returned. ``QCount`` is the number of non-NULL gaps.
    * ``K*`` columns -- duration of the kernel (``KernDur``).

    Rows are ordered by the average total duration, largest first.

    Report arguments (see ``_arg_opts``):

    * ``base``    -- use the kernel's ``shortName`` column.
    * ``mangled`` -- use the kernel's ``mangledName`` column when the table
      has one.
    """

    display_name = 'DEPRECATED - Use cuda_kern_exec_sum instead'
    # NOTE(review): this is a plain string, not an f-string, so a single
    # str.format() pass fills in SCRIPT but leaves a literal "{DISPLAY_NAME}"
    # behind -- confirm how the base class expands `usage`.
    usage = '{SCRIPT} -- {{DISPLAY_NAME}}'
    # Presumably keeps this deprecated report out of report listings;
    # the flag is consumed by nsysstats -- TODO confirm.
    should_display = False

    # SQL template. {NAME_COL_NAME} is substituted in setup() with the kernel
    # name column to use; coalesce() falls back to demangledName whenever the
    # chosen column is NULL for a row. The CASE expression drops a trailing
    # six-character "_vNNNN"-style suffix from the runtime API name.
    # NOTE(review): median() and stdev() are not core SQLite functions; they
    # are presumably registered by nsysstats -- confirm.
    query_stub = """
WITH
    runkern AS (
        SELECT
            (r.globalTid >> 24) & 0x00FFFFFF AS pid,
            r.globalTid & 0x00FFFFFF AS tid,
            k.deviceId AS deviceId,
            r.end - r.start AS ApiDur,
            iif(k.start - r.end >= 0, k.start - r.end, NULL) AS QueDur,
            k.end - k.start AS KernDur,
            max(r.end, k.end) - r.start AS totalDur,
            CASE substr(rname.value, -6, 2)
                WHEN '_v'
                    THEN substr(rname.value, 1, length(rname.value)-6)
                ELSE rname.value
            END AS apiName,
            kname.value AS kernName
        FROM
            CUPTI_ACTIVITY_KIND_KERNEL AS k
        JOIN
            CUPTI_ACTIVITY_KIND_RUNTIME AS r
            ON      k.correlationId == r.correlationId
                AND k.globalPid == (r.globalTid & 0xFFFFFFFFFF000000)
        LEFT JOIN
            StringIds AS rname
            ON r.nameId == rname.id
        LEFT JOIN
            StringIds AS kname
            ON kname.id == coalesce(k.{NAME_COL_NAME}, k.demangledName)
    )

SELECT
    pid AS PID,  -- 1
    tid AS TID,
    deviceId AS DevId,

    count(*) AS Count,
    count(QueDur) AS QCount, -- 5

    round(avg(totalDur), 1) AS "TAvg:dur_ns",
    round(median(totalDur), 1) AS "TMed:dur_ns",
    min(totalDur) AS "TMin:dur_ns",
    max(totalDur) AS "TMax:dur_ns",
    round(stdev(totalDur), 1) AS "TStdDev:dur_ns", -- 10

    round(avg(ApiDur), 1) AS "AAvg:dur_ns",
    round(median(ApiDur), 1) AS "AMed:dur_ns",
    min(ApiDur) AS "AMin:dur_ns",
    max(ApiDur) AS "AMax:dur_ns",
    round(stdev(ApiDur), 1) AS "AStdDev:dur_ns", -- 15

    round(avg(QueDur), 1) AS "QAvg:dur_ns",
    round(median(QueDur), 1) AS "QMed:dur_ns",
    min(QueDur) AS "QMin:dur_ns",
    max(QueDur) AS "QMax:dur_ns",
    round(stdev(QueDur), 1) AS "QStdDev:dur_ns", -- 20

    round(avg(KernDur), 1) AS "KAvg:dur_ns",
    round(median(KernDur), 1) AS "KMed:dur_ns",
    min(KernDur) AS "KMin:dur_ns",
    max(KernDur) AS "KMax:dur_ns",
    round(stdev(KernDur), 1) AS "KStdDev:dur_ns", -- 25

    apiName AS "API Name",
    kernName AS "Kernel Name" -- 27
FROM runkern
GROUP BY 1, 2, 3, 26, 27
ORDER BY 6 DESC
"""

    # Tables the query reads, each mapped to the message to report when the
    # table is absent from the database file.
    table_checks = {
        'CUPTI_ACTIVITY_KIND_KERNEL':
            "{DBFILE} does not contain CUDA kernel data.",
        'CUPTI_ACTIVITY_KIND_RUNTIME':
            "{DBFILE} does not contain CUDA API data.",
    }

    # Optional boolean report arguments; both select which kernel name
    # column the query uses (see setup()).
    _arg_opts = [
        [['base'],    {'action': 'store_true'}],
        [['mangled'], {'action': 'store_true'}],
    ]

    def setup(self):
        """Run the base-class setup, then build the final query text.

        The kernel name column is chosen in this order of precedence:

        1. ``shortName`` when the ``base`` argument was given.
        2. ``mangledName`` when ``mangled`` was given and
           ``CUPTI_ACTIVITY_KIND_KERNEL`` has that column.
        3. ``demangledName`` otherwise, including when ``mangled`` was
           requested but the column does not exist.

        Returns:
            Whatever ``super().setup()`` returned when that is not ``None``
            (treated as an error and passed through untouched); otherwise
            ``None`` after ``self.query`` has been assigned.
        """
        err = super().setup()
        # Identity comparison is the correct test for the None singleton and
        # cannot be affected by a custom __eq__/__ne__ on the returned value.
        if err is not None:
            return err

        name_col_name = 'demangledName'
        if self.parsed_args.base:
            name_col_name = 'shortName'
        elif (self.parsed_args.mangled and
            self.table_col_exists('CUPTI_ACTIVITY_KIND_KERNEL', 'mangledName')):
            name_col_name = 'mangledName'

        self.query = self.query_stub.format(NAME_COL_NAME = name_col_name)
# Script entry point: hand control to the report's Main() classmethod only
# when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    CUDAKernelExecSummary.Main()
