summaryrefslogtreecommitdiffstats
path: root/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H
blob: f1c072eea41f1cee6fb8ba4331ae86f52ee0eedc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
/* IBM_PROLOG_BEGIN_TAG                                                   */
/* This is an automatically generated prolog.                             */
/*                                                                        */
/* $Source: src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H $                  */
/*                                                                        */
/* OpenPOWER HostBoot Project                                             */
/*                                                                        */
/* Contributors Listed Below - COPYRIGHT 2016,2019                        */
/* [+] International Business Machines Corp.                              */
/*                                                                        */
/*                                                                        */
/* Licensed under the Apache License, Version 2.0 (the "License");        */
/* you may not use this file except in compliance with the License.       */
/* You may obtain a copy of the License at                                */
/*                                                                        */
/*     http://www.apache.org/licenses/LICENSE-2.0                         */
/*                                                                        */
/* Unless required by applicable law or agreed to in writing, software    */
/* distributed under the License is distributed on an "AS IS" BASIS,      */
/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or        */
/* implied. See the License for the specific language governing           */
/* permissions and limitations under the License.                         */
/*                                                                        */
/* IBM_PROLOG_END_TAG                                                     */

/** @file  prdfMemTdCtlr.H
 *  @brief A state machine for memory Targeted Diagnostics (TD).
 */

#ifndef __prdfMemTdCtlr_H
#define __prdfMemTdCtlr_H

// Framework includes
#include <iipServiceDataCollector.h>

// Platform includes
#include <prdfMemTdQueue.H>
#include <prdfMemTdRankList.H>
#include <prdfPlatServices.H>

namespace PRDF
{

/**
 * @brief A state machine for memory Targeted Diagnostics (TD).
 *
 * Holds a queue of pending TD procedures (iv_queue) and the procedure that is
 * currently in progress (iv_curProcedure), and advances through them one step
 * at a time via nextStep().
 *
 * @tparam T Target type of the chip this controller is bound to. The
 *           constructor asserts that it matches the type of the given chip.
 */
template <TARGETING::TYPE T>
class MemTdCtlr
{
  public:

    MemTdCtlr() = delete; // Don't allow default constructor

    /**
     * @brief Constructor
     *
     * This constructor will only be called in the MCBIST, MBA, or OCMB data
     * bundle, which already checks for a valid type.
     *
     * Need to initialize iv_stoppedRank to a valid entry in iv_rankList. Use
     * the last entry in the list so that the 'next' rank is the first entry
     * in the list.
     *
     * @note  The initializer for iv_stoppedRank reads iv_rankList. This is
     *        only valid because iv_rankList is declared before iv_stoppedRank
     *        in this class (members are constructed in declaration order).
     *        Do not reorder those two declarations.
     * @note  NOTE(review): back() presumes the rank list is never empty --
     *        confirm that TdRankList guarantees at least one entry.
     *
     * @param i_chip An MCBIST, MBA, or OCMB chip.
     */
    explicit MemTdCtlr( ExtensibleChip * i_chip ) :
        iv_chip( i_chip ), iv_rankList( i_chip ),
        iv_stoppedRank( iv_rankList.getList().back() )
    {
        PRDF_ASSERT( T == iv_chip->getType() );
    }

    /**
     * @brief  Determines and executes the next course of action after a
     *         maintenance command complete attention.
     * @note   Initializes the TD controller, if needed.
     * @param  io_sc The step code data struct.
     * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
     */
    uint32_t handleCmdComplete( STEP_CODE_DATA_STRUCT & io_sc );

    /**
     * @brief  This only pushes a new TdEntry to the back of iv_queue.
     *
     * At runtime, a TPS_EVENT entry whose chip/rank has been banned via
     * banTps() is not queued. An error trace is written and the entry is
     * deleted instead.
     *
     * @note   This function returns nothing, so the caller cannot tell
     *         whether the entry was queued or rejected. The entry must
     *         therefore be considered handed over to the TD controller in
     *         both cases and must not be used by the caller afterwards.
     * @post   Fetch attentions must also call handleTdEvent() to trigger
     *         diagnostics, if not already in progress.
     * @param  i_entry  The new TD queue entry. Must not be null.
     */
    void pushToQueue( TdEntry * i_entry )
    {
        #ifdef __HOSTBOOT_RUNTIME
        if ( TdEntry::TPS_EVENT == i_entry->getType() &&
             isTpsBanned(i_entry->getChip(), i_entry->getRank()) )
        {
            PRDF_ERR( "[MemTdCtlr::pushToQueue] TPS banned on 0x%08x 0x%02x",
                      i_entry->getChip()->getHuid(), i_entry->getRank() );

            // The entry never reaches iv_queue, so nothing else will ever
            // release it. Free it here (after the trace above has finished
            // reading from it) to avoid leaking one entry per rejected TPS
            // request.
            // NOTE(review): assumes entries are allocated with 'new' and that
            // TdQueue frees the entries it holds -- confirm against TdQueue
            // and the callers of this function.
            delete i_entry;

            return; // prevent the entry from being added to the queue.
        }
        #endif

        iv_queue.push(i_entry);
    }

    #ifdef __HOSTBOOT_RUNTIME

    /**
     * @brief  This tells the TD controller there was a TdEntry added to the
     *         queue (via pushToQueue) because of a fetch attention and
     *         additional processing may be needed to start the next TD
     *         procedure. If there isn't a current TD procedure in progress,
     *         this function will stop background scrubbing and start the first
     *         procedure in the queue.
     *
     * @pre    A TdEntry must be added to the queue (via pushToQueue) before
     *         calling this function.
     *
     * @note   Initializes the TD controller, if needed.
     * @param  io_sc    The step code data struct.
     * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
     */
    uint32_t handleTdEvent( STEP_CODE_DATA_STRUCT & io_sc );

    /**
     * @brief  Bans TPS on the given rank. Any attempts to add a TPS procedure
     *         to the queue for this rank will be ignored.
     * @param  i_chip MCA, MBA, or OCMB chip.
     * @param  i_rank The target slave rank.
     */
    void banTps( ExtensibleChip * i_chip, const MemRank & i_rank )
    {
        // It doesn't matter what we set the value to, we just need to make sure
        // the rank exists in the map.
        iv_tpsBans[std::make_pair(i_chip, i_rank)] = true;
    }

    /**
     * @brief Handles reset-reload or FO scenario.
     *
     * This does not call initialize() or start any maintenance commands.
     * Instead, it checks the hardware's current state and ensures by the end of
     * the function that either a command is currently running or there will be
     * a command complete attention pending that PRD will handle separately.
     *
     * If there is already an active command complete attention, this function
     * does nothing because PRD will handle the attention soon.
     *
     * If there is no active command complete attention and there is no command
     * currently in progress, it will set the command complete attention and PRD
     * will handle that attention soon.
     *
     * Otherwise, there is a command in progress. So, it will check for any
     * unverified chip marks. If any exist, it will force the current command to
     * stop, causing a command complete attention that PRD will handle soon.
     *
     * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
     */
    uint32_t handleRrFo();

    #endif

  private:

    /**
     * @brief  Initializes the TD controller, if needed.
     *
     * This should be called at the beginning of every public function to ensure
     * the TD controller is initialized.
     *
     * During MemDiags, this initializes iv_broadcastModeCapable.
     *
     * At runtime, this is used to query hardware for any unverified chip marks
     * that may have occurred after starting background scrubbing, but before
     * PRD is up and running. We may also have unverified chip marks if the HBRT
     * service is stopped and restarted (PRD is reinitialized and all previous
     * state machine data is lost).
     *
     * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
     */
    uint32_t initialize();

    /**
     * @brief This is called when there are no more TD procedures to execute.
     *
     * During Memory Diagnostics, this means the current pattern test command
     * has reached the end of memory on the MBA or MCBIST. So this function will
     * tell MDIA to move onto the next pattern test command, if needed.
     *
     * At runtime, this function will restart background scrubbing.
     *
     * @param  io_sc The step code data struct.
     * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
     */
    uint32_t defaultStep( STEP_CODE_DATA_STRUCT & io_sc );

    /**
     * @brief Progresses onto the next step of the state machine.
     *
     * This function will move onto the next step of the current procedure, if
     * one is in progress. Otherwise, it takes the next procedure from the
     * queue, if one exists, and starts that procedure. When the queue is empty
     * and nothing is in progress, defaultStep() is called instead.
     *
     * The current procedure's queue entry is only popped once that procedure
     * reports it is done or one of its steps fails.
     *
     * @param  io_sc The step code data struct.
     * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
     */
    uint32_t nextStep( STEP_CODE_DATA_STRUCT & io_sc )
    {
        uint32_t rc = SUCCESS;

        if ( nullptr == iv_curProcedure ) // Nothing currently in progress.
        {
            if ( iv_queue.empty() ) // No more TD procedures.
            {
                rc = defaultStep( io_sc );
            }
            else
            {
                // Get the next entry in the queue and move forward. The
                // recursive call lands in the 'else' branch below because
                // iv_curProcedure is now set.
                iv_curProcedure = iv_queue.getNextEntry();
                rc = nextStep( io_sc );
            }
        }
        else
        {
            // Do the next step of the current procedure.
            bool done = false;
            rc = iv_curProcedure->nextStep( io_sc, done );
            if ( SUCCESS != rc )
            {
                // Something failed. Clean up the current command and stop.
                iv_curProcedure = nullptr;
                iv_queue.pop();
            }
            else if ( done )
            {
                // This procedure is done so clean it up and move on to
                // whatever is next (another procedure or defaultStep()).
                iv_curProcedure = nullptr;
                iv_queue.pop();
                rc = nextStep( io_sc );
            }
            // Otherwise the procedure has more steps to run; leave it as the
            // current procedure and return.
        }

        return rc;
    }

    /**
     * @brief  This is called when handling a command complete attention for a
     *         non-TD command to initialize iv_stoppedRank then check for any
     *         ECC errors.
     * @param  o_errorsFound True if errors were found and handled. False
     *                       otherwise.
     * @param  io_sc         The step code data struct.
     * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
     */
    uint32_t analyzeCmdComplete( bool & o_errorsFound,
                                 STEP_CODE_DATA_STRUCT & io_sc );

    /**
     * @brief Adds the TD controller state to the capture data.
     * @param io_sc      The step code data struct.
     * @param i_startEnd Description tag for the capture data. Used to
     *        distinguish between data captured at the beginning or end of
     *        analysis.
     */
    void collectStateCaptureData( STEP_CODE_DATA_STRUCT & io_sc,
                                  const char * i_startEnd );

    #ifdef __HOSTBOOT_RUNTIME

    /**
     * @brief  Queries whether TPS has been banned (via banTps) on a rank.
     * @param  i_chip The chip that was given to banTps() for this rank.
     * @param  i_rank The target slave rank.
     * @return True, if this slave rank has been banned. False, otherwise.
     */
    bool isTpsBanned( ExtensibleChip * i_chip, const MemRank & i_rank )
    {
        // A rank is banned if and only if its chip/rank pair is a key in the
        // map. The mapped value itself is never inspected.
        std::pair<ExtensibleChip *, MemRank> e = std::make_pair(i_chip, i_rank);
        return ( iv_tpsBans.end() != iv_tpsBans.find(e) );
    }

    /**
     * @brief  Masks NCE and TCE ECC attentions.
     * @note   Only intended to be used just before starting a new TD procedure.
     * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
     */
    uint32_t maskEccAttns();

    /**
     * @brief  Clears and unmasks NCE and TCE ECC attentions.
     * @note   maskEccAttns() will not mask fetch UEs, however, this function
     *         will unmask them because it is possible that UEs exceeded
     *         threshold and were masked by the rule code.
     * @note   Only intended to be used just after completing a TD procedure.
     * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
     */
    uint32_t unmaskEccAttns();

    /**
     * @brief  Determines whether background scrubbing can be resumed.
     * @param  o_canResume True, if background scrubbing can be resumed. False,
     *                     if a new background scrub command must be started.
     * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
     */
    uint32_t canResumeBgScrub( bool & o_canResume );

    #endif

  private: // instance variables

    /** An MCBIST, MBA, or OCMB chip associated with this TD controller. */
    ExtensibleChip * const iv_chip;

    /** The TD queue that contains all of the pending TD procedures. */
    TdQueue iv_queue;

    /** The procedure that is currently in progress. This points at an entry
     *  obtained from iv_queue (see nextStep()) and is reset to nullptr
     *  whenever that entry is popped. */
    TdEntry * iv_curProcedure = nullptr;

    /** A list of all ranks behind iv_chip. Must remain declared before
     *  iv_stoppedRank because the constructor initializes iv_stoppedRank from
     *  this list. */
    TdRankList<T> iv_rankList;

    /** If a non-TD command stopped somewhere in the middle of memory, PRD will
     *  need to restart that command on the next configured rank. This variable
     *  stores where the non-TD command stopped. The non-TD command will then
     *  be restarted on the next rank in defaultStep() after all targeted
     *  diagnostics are complete. */
    TdRankListEntry iv_stoppedRank;

    /** True if the TD controller has been initialized. False otherwise. */
    bool iv_initialized = false;

    #ifdef __HOSTBOOT_RUNTIME

    /** True if background scrubbing should be resumed after pausing on error.
     *  False if a TD procedure had been executed and background scrubbing needs
     *  to be restarted with a new command. */
    bool iv_resumeBgScrub = false;

    /** Map to keep track of ranks that have banned TPS. Only the presence of a
     *  key matters; the mapped bool is never read. */
    std::map< std::pair<ExtensibleChip *, MemRank>, bool > iv_tpsBans;

    #else // IPL only

    /** Indicates if broadcast mode is capable on iv_chip. */
    bool iv_broadcastModeCapable = false;

    #endif

};

} // end namespace PRDF

#endif // __prdfMemTdCtlr_H

OpenPOWER on IntegriCloud