src/import/chips/p9/procedures/hwp/memory/lib/phy/cal_timers.H


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259

/* IBM_PROLOG_BEGIN_TAG                                                   */
/* This is an automatically generated prolog.                             */
/*                                                                        */
/* $Source: src/import/chips/p9/procedures/hwp/memory/lib/phy/cal_timers.H $ */
/*                                                                        */
/* OpenPOWER HostBoot Project                                             */
/*                                                                        */
/* Contributors Listed Below - COPYRIGHT 2015,2016                        */
/* [+] International Business Machines Corp.                              */
/*                                                                        */
/*                                                                        */
/* Licensed under the Apache License, Version 2.0 (the "License");        */
/* you may not use this file except in compliance with the License.       */
/* You may obtain a copy of the License at                                */
/*                                                                        */
/*     http://www.apache.org/licenses/LICENSE-2.0                         */
/*                                                                        */
/* Unless required by applicable law or agreed to in writing, software    */
/* distributed under the License is distributed on an "AS IS" BASIS,      */
/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or        */
/* implied. See the License for the specific language governing           */
/* permissions and limitations under the License.                         */
/*                                                                        */
/* IBM_PROLOG_END_TAG                                                     */

///
/// @file cal_timers.H
/// @brief Subroutines to calculate the duration of training operations
///
// *HWP HWP Owner: Brian Silver <bsilver@us.ibm.com>
// *HWP HWP Backup: Andre Marin <aamarin@us.ibm.com>
// *HWP Team: Memory
// *HWP Level: 2
// *HWP Consumed by: FSP:HB

#ifndef _MSS_CAL_TIMERS_H_
#define _MSS_CAL_TIMERS_H_

#include <fapi2.H>
#include <lib/shared/mss_const.H>
#include <lib/utils/conversions.H>
#include <lib/utils/poll.H>

namespace mss
{

///
/// @brief Estimate the number of memory clock cycles write leveling takes on a single rank
/// @tparam T the target type used to get attributes
/// @param[in] i_target target passed to mss::twlo_twloe and mss::cycles_to_ns
/// @return uint64_t, the cycles taken for write leveling on a single rank
///
template<fapi2::TargetType T>
uint64_t wr_lvl_cycles(const fapi2::Target<T>& i_target )
{
    // Approximate run time, in memory clock cycles per rank:
    //   (80 + TWLO_TWLOE) x NUM_VALID_SAMPLES x
    //   (384/(BIG_STEP + 1) + (2 x (BIG_STEP + 1))/(SMALL_STEP + 1)) + 20
    const uint64_t l_twlo_twloe = mss::twlo_twloe(i_target);

    // The two step-size dependent terms of the formula. auto keeps the exact
    // arithmetic type the constants produce, so the result is unchanged.
    const auto l_big_step_term = 384 / (WR_LVL_BIG_STEP + 1);
    const auto l_small_step_term = (2 * (WR_LVL_BIG_STEP + 1)) / (WR_LVL_SMALL_STEP + 1);

    const uint64_t l_cycles =
        (80 + l_twlo_twloe) * WR_LVL_NUM_VALID_SAMPLES * (l_big_step_term + l_small_step_term) + 20;

    FAPI_DBG("wr_lvl_cycles: %llu(%lluns) (%llu, %llu, %llu, %llu)",
             l_cycles, mss::cycles_to_ns(i_target, l_cycles),
             l_twlo_twloe, WR_LVL_NUM_VALID_SAMPLES, WR_LVL_BIG_STEP, WR_LVL_SMALL_STEP);

    return l_cycles;
}

///
/// @brief Estimate the number of DRAM clock cycles DQS align takes
/// @tparam T the target type used to get attributes
/// @param[in] i_target target passed to mss::cycles_to_ns for the debug trace
/// @return uint64_t, the cycles taken for DQS align (a fixed value, independent of i_target)
///
template<fapi2::TargetType T>
uint64_t dqs_align_cycles(const fapi2::Target<T>& i_target )
{
    // Approximate run time: 6 x 600 x 4 DRAM clocks per rank pair.
    const uint64_t l_cycles = 6 * 600 * 4;

    FAPI_DBG("dqs_align_cycles: %llu(%lluns)",
             l_cycles, mss::cycles_to_ns(i_target, l_cycles));

    return l_cycles;
}

///
/// @brief Estimate the number of DRAM clock cycles RD clock align takes
/// @tparam T the target type used to get attributes
/// @param[in] i_target target passed to mss::cycles_to_ns for the debug trace
/// @return uint64_t, the cycles taken for RD clock align
///
template<fapi2::TargetType T>
uint64_t rdclk_align_cycles(const fapi2::Target<T>& i_target )
{
    // Approximate run time, in DRAM clocks per rank pair:
    //   24 x ((1024/COARSE_CAL_STEP_SIZE + 4 x COARSE_CAL_STEP_SIZE) x 4 + 32)

    // Step-size dependent part of the formula; auto preserves the constants' arithmetic type.
    const auto l_step_term = 1024 / COARSE_CAL_STEP_SIZE + 4 * COARSE_CAL_STEP_SIZE;

    const uint64_t l_cycles = 24 * (l_step_term * 4 + 32);

    FAPI_DBG("rdclk_align_cycles: %llu(%lluns) (%llu)",
             l_cycles, mss::cycles_to_ns(i_target, l_cycles), COARSE_CAL_STEP_SIZE);

    return l_cycles;
}

///
/// @brief Estimate the number of DRAM clock cycles read centering takes
/// @tparam T the target type used to get attributes
/// @param[in] i_target target passed to mss::cycles_to_ns for the debug trace
/// @return uint64_t, the cycles taken for read centering
///
template<fapi2::TargetType T>
uint64_t read_ctr_cycles(const fapi2::Target<T>& i_target )
{
    // Approximate run time, in DRAM clocks per rank pair:
    //   6 x (512/COARSE_CAL_STEP_SIZE + 4 x (COARSE_CAL_STEP_SIZE + 4 x CONSEQ_PASS)) x 24

    // Step-size / consecutive-pass dependent part; auto preserves the constants' arithmetic type.
    const auto l_step_term = 512 / COARSE_CAL_STEP_SIZE + 4 * (COARSE_CAL_STEP_SIZE + 4 * CONSEQ_PASS);

    const uint64_t l_cycles = 6 * l_step_term * 24;

    FAPI_DBG("read_ctr_cycles %llu(%lluns) (%llu, %llu)",
             l_cycles, mss::cycles_to_ns(i_target, l_cycles),
             COARSE_CAL_STEP_SIZE, CONSEQ_PASS);

    return l_cycles;
}

///
/// @brief Estimate the number of DRAM clock cycles write centering takes
/// @tparam T the target type used to get attributes
/// @param[in] i_target target passed to mss::cycles_to_ns for the debug trace
/// @return uint64_t, the cycles taken for write centering
///
template<fapi2::TargetType T>
uint64_t write_ctr_cycles(const fapi2::Target<T>& i_target )
{
    // Approximate run time, in DRAM clocks per rank pair:
    //   1000 + (NUM_VALID_SAMPLES * (FW_WR_RD + FW_RD_WR + 16) *
    //           (1024/(SMALL_STEP + 1) + 128/(BIG_STEP + 1)) +
    //           2 * (BIG_STEP + 1)/(SMALL_STEP + 1)) x 24

    // Sample-count dependent term. auto keeps the constants' arithmetic type so the
    // evaluation is identical to writing the formula as one expression.
    const auto l_sample_term = WR_LVL_NUM_VALID_SAMPLES * (WR_CNTR_FW_WR_RD + WR_CNTR_FW_RD_WR + 16) *
                               (1024 / (WR_LVL_SMALL_STEP + 1) + 128 / (WR_LVL_BIG_STEP + 1));

    // Ratio of big step to small step, doubled.
    const auto l_step_ratio_term = 2 * (WR_LVL_BIG_STEP + 1) / (WR_LVL_SMALL_STEP + 1);

    const uint64_t l_cycles = 1000 + (l_sample_term + l_step_ratio_term) * 24;

    FAPI_DBG("write_ctr_cycles: %lu(%luns) (%u, %u, %u, %u, %u)",
             l_cycles, mss::cycles_to_ns(i_target, l_cycles),
             WR_LVL_NUM_VALID_SAMPLES, WR_CNTR_FW_WR_RD, WR_CNTR_FW_RD_WR, WR_LVL_BIG_STEP, WR_LVL_SMALL_STEP);

    return l_cycles;
}

///
/// @brief Estimate the number of DRAM clock cycles coarse write takes
/// @tparam T the target type used to get attributes
/// @param[in] i_target target passed to mss::cycles_to_ns for the debug trace
/// @return uint64_t, the cycles taken for coarse write (a fixed value, independent of i_target)
///
template<fapi2::TargetType T>
uint64_t coarse_wr_cycles(const fapi2::Target<T>& i_target )
{
    // Maximum run length for this calibration algorithm:
    // approximately 40 DRAM clocks per rank pair.
    const uint64_t l_cycles = 40;

    FAPI_DBG("coarse_wr_cycles: %llu(%lluns)",
             l_cycles, mss::cycles_to_ns(i_target, l_cycles));

    return l_cycles;
}

///
/// @brief Estimate the number of DRAM clock cycles coarse read takes
/// @tparam T the target type used to get attributes
/// @param[in] i_target target passed to mss::cycles_to_ns for the debug trace
/// @return uint64_t, the cycles taken for coarse rd (a fixed value, independent of i_target)
///
template<fapi2::TargetType T>
uint64_t coarse_rd_cycles(const fapi2::Target<T>& i_target )
{
    // Maximum run length for this calibration algorithm:
    // approximately 32 DRAM clocks per rank pair.
    const uint64_t l_cycles = 32;

    FAPI_DBG("coarse_rd_cycles: %llu(%lluns)",
             l_cycles, mss::cycles_to_ns(i_target, l_cycles));

    return l_cycles;
}

///
/// @brief Configure the polling intervals with the proper timings based on the cal steps enabled
/// @tparam T the fapi2::TargetType of the port
/// @param[in] i_target the port target
/// @param[in,out] i_poll the poll_parameters to update
/// @param[in] i_cal_steps_enabled a fapi2::buffer<uint16_t> representing the cal steps enabled
/// @return FAPI2_RC_SUCCESS iff everything is OK
///
template<fapi2::TargetType T>
inline fapi2::ReturnCode cal_timer_setup(const fapi2::Target<T>& i_target,
        poll_parameters& i_poll,
        const fapi2::buffer<uint16_t>& i_cal_steps_enabled)
{
    // First, calculate the total number of cycles this cal should take if everything
    // runs to completion. Only the steps whose bit is set contribute.
    uint64_t l_total_cycles = i_cal_steps_enabled.getBit<WR_LEVEL>() ? wr_lvl_cycles(i_target) : 0;
    l_total_cycles += i_cal_steps_enabled.getBit<DQS_ALIGN>()   ? dqs_align_cycles(i_target)   : 0;
    l_total_cycles += i_cal_steps_enabled.getBit<RDCLK_ALIGN>() ? rdclk_align_cycles(i_target) : 0;
    l_total_cycles += i_cal_steps_enabled.getBit<READ_CTR>()    ? read_ctr_cycles(i_target)    : 0;
    l_total_cycles += i_cal_steps_enabled.getBit<WRITE_CTR>()   ? write_ctr_cycles(i_target)   : 0;
    l_total_cycles += i_cal_steps_enabled.getBit<COARSE_WR>()   ? coarse_wr_cycles(i_target)   : 0;
    l_total_cycles += i_cal_steps_enabled.getBit<COARSE_RD>()   ? coarse_rd_cycles(i_target)   : 0;

    // Now we have to decide if we're going to abort on an error or keep going. If we keep going,
    // then we want our initial delay to be the expected time to completion - we don't have much
    // of a choice. If we abort on error then we want to have a small initial delay, and poll more
    // times with longer durations between the polling.

    // NOTE: the macro is defined immediately before it is tested, so the #else branch below
    // is currently never compiled.
#define THRASH_CCS_IP_IN_SIM 1
#ifdef THRASH_CCS_IP_IN_SIM

    // Minimal initial delay (one cycle), then poll every 1us
    i_poll.iv_initial_delay = mss::cycles_to_ns(i_target, 1);
    i_poll.iv_initial_sim_delay = mss::cycles_to_simcycles(1);

    i_poll.iv_delay = DELAY_1US;
    i_poll.iv_sim_delay = mss::cycles_to_simcycles(mss::ns_to_cycles(i_target, DELAY_1US));

    // Time left after the initial delay, clamped at 0. The explicit template argument keeps
    // std::max well-formed on platforms where int64_t is not long.
    int64_t l_ns_left = std::max<int64_t>(0,
                        static_cast<int64_t>(mss::cycles_to_ns(i_target, l_total_cycles)) -
                        static_cast<int64_t>(i_poll.iv_initial_delay));

    // Round up: a partial polling interval still needs a full poll to cover it
    i_poll.iv_poll_count = l_ns_left / i_poll.iv_delay;
    i_poll.iv_poll_count += (l_ns_left % i_poll.iv_delay) ? 1 : 0;

    // Don't let the poll count be 0 - that just makes for a bad day.
    i_poll.iv_poll_count = (i_poll.iv_poll_count == 0) ? 1 : i_poll.iv_poll_count;
#else

    if (CAL_ABORT_ON_ERROR)
    {
        // We don't want to wait too long for the initial check, just some heuristics here
        i_poll.iv_initial_delay = mss::cycles_to_ns(i_target, l_total_cycles / 8);
        i_poll.iv_initial_sim_delay = mss::cycles_to_simcycles(l_total_cycles / 8);

        // Delay 1us between polls, and setup the poll count so
        // iv_initial_delay + (iv_delay * iv_poll_count) covers l_total_cycles
        i_poll.iv_delay = DELAY_1US;
        i_poll.iv_sim_delay = mss::cycles_to_simcycles(mss::ns_to_cycles(i_target, DELAY_1US));

        // Time left after the initial delay, clamped at 0
        int64_t l_ns_left = std::max<int64_t>(0,
                            static_cast<int64_t>(mss::cycles_to_ns(i_target, l_total_cycles)) -
                            static_cast<int64_t>(i_poll.iv_initial_delay));

        // Round up: a partial polling interval still needs a full poll to cover it
        i_poll.iv_poll_count = l_ns_left / i_poll.iv_delay;
        i_poll.iv_poll_count += (l_ns_left % i_poll.iv_delay) ? 1 : 0;

        // Don't let the poll count be 0 - that just makes for a bad day.
        i_poll.iv_poll_count = (i_poll.iv_poll_count == 0) ? 1 : i_poll.iv_poll_count;
    }
    else
    {
        // Not aborting on error: wait the full expected duration up front.
        // NOTE(review): iv_delay, iv_sim_delay and iv_poll_count are left at whatever the
        // caller passed in on this path - confirm poll_parameters' defaults are acceptable.
        i_poll.iv_initial_delay = mss::cycles_to_ns(i_target, l_total_cycles);
        i_poll.iv_initial_sim_delay = mss::cycles_to_simcycles(l_total_cycles);
    }

#endif

    FAPI_INF("cal abort on error? %s. tc: %luc, id: %luns(%lusc), d: %lu(%lusc), pc: %lu",
             (CAL_ABORT_ON_ERROR ? "yup" : "nope"), l_total_cycles, i_poll.iv_initial_delay,
             i_poll.iv_initial_sim_delay, i_poll.iv_delay, i_poll.iv_sim_delay, i_poll.iv_poll_count);

    return fapi2::FAPI2_RC_SUCCESS;
}

}
#endif