summaryrefslogtreecommitdiffstats
path: root/src/usr/diag/prdf/common/plat/p9/prdfP9TodPlugins.C
blob: 76468e6a3f2d891afc63ede7fe923e62f44d6c82 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
/* IBM_PROLOG_BEGIN_TAG                                                   */
/* This is an automatically generated prolog.                             */
/*                                                                        */
/* $Source: src/usr/diag/prdf/common/plat/p9/prdfP9TodPlugins.C $         */
/*                                                                        */
/* OpenPOWER HostBoot Project                                             */
/*                                                                        */
/* Contributors Listed Below - COPYRIGHT 2018                             */
/* [+] International Business Machines Corp.                              */
/*                                                                        */
/*                                                                        */
/* Licensed under the Apache License, Version 2.0 (the "License");        */
/* you may not use this file except in compliance with the License.       */
/* You may obtain a copy of the License at                                */
/*                                                                        */
/*     http://www.apache.org/licenses/LICENSE-2.0                         */
/*                                                                        */
/* Unless required by applicable law or agreed to in writing, software    */
/* distributed under the License is distributed on an "AS IS" BASIS,      */
/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or        */
/* implied. See the License for the specific language governing           */
/* permissions and limitations under the License.                         */
/*                                                                        */
/* IBM_PROLOG_END_TAG                                                     */
/**@file   prdfP9TodPlugins.C
 * @brief  defines all the TOD error plugins
 */

#include <prdfPluginDef.H>
#include <prdfPluginMap.H>
#include <prdfExtensibleChip.H>
#include <iipSystem.h>
#include <prdfP9ProcDomain.H>
#include <prdfGlobal_common.H>
#include <iipServiceDataCollector.h>
#include <prdfRegisterCache.H>
#include <UtilHash.H>
#include <algorithm>
#include <prdfPlatProcConst.H>

using namespace TARGETING;

namespace PRDF
{

using namespace PlatServices;
using namespace TOD;

/** @struct TodFaultData
 *  TOD fault isolation information gathered from a single chip.
 *
 *  Every two-element array below is indexed by topology role:
 *  element 0 describes the active topology, element 1 the backup topology.
 */
struct TodFaultData
{
    TargetHandle_t chipReportingError; // chip this record was collected from
    bool phypDetectedFault; // PHYP flagged a TOD fault on this chip
                            // (on either topology)
    bool isActiveMdmt;      // chip is the MDMT of the active topology
    bool isBackupMdmt;      // chip is the MDMT of the backup topology
    bool faultDetected[2];  // a TOD fault was found on the given topology
    bool isMdmtAndFaulty[2];// chip is MDMT and faulty on the same topology
    bool activeTopologyIsPrimary; // the active topology is the primary one
    TargetHandle_t chipSourcingClk[2];// for a non-MDMT chip, the chip that
                                      // sources its TOD clock
    uint32_t activeMasterPathPosition[2]; // Clock position providing the TOD
                                          // clock source to an MDMT

    /**
     * @brief  Constructor. Records the reporting chip and resets every other
     *         field to "nothing found" (false / NULL / 0).
     * @param  i_procTgt  Target of the chip this record belongs to.
     */
    explicit TodFaultData( TargetHandle_t i_procTgt ) :
        chipReportingError( i_procTgt ),
        phypDetectedFault( false ),
        isActiveMdmt( false ),
        isBackupMdmt( false ),
        activeTopologyIsPrimary( false )
    {
        // Reset the per-topology entries (0 = active, 1 = backup).
        for ( uint32_t topo = 0; topo < 2; topo++ )
        {
            faultDetected[topo]            = false;
            isMdmtAndFaulty[topo]          = false;
            chipSourcingClk[topo]          = NULL;
            activeMasterPathPosition[topo] = 0;
        }
    }
};

/** @struct TopologySwitchDetails
 *  System TOD failover status.
 */
struct TopologySwitchDetails
{
    // hw failover status of master path
    bool masterPathHwFailOver = false;

    // topology switch status by Phyp
    bool phypSwitchedTopology = false;

    /**
     * @brief Constructor. Both status flags start out as false.
     */
    TopologySwitchDetails() {}
};

namespace Proc
{
/**
 * @brief   Adds the "TODReg" capture group of every chip in the PROC domain
 *          to the capture data of the given step code.
 * @param   i_stepcode  The step code data struct
 * @return  SUCCESS (unconditionally).
 */
int32_t todCaptureRegisters( STEP_CODE_DATA_STRUCT & i_stepcode )
{
    ProcDomain * procDomain =
        (ProcDomain*)systemPtr->GetDomain( PROC_DOMAIN );

    size_t chipIdx = 0;
    while ( chipIdx < procDomain->GetSize() )
    {
        RuleChip * procChip = procDomain->LookUp( chipIdx );

        CaptureData & captureData =
            i_stepcode.service_data->GetCaptureData();
        procChip->CaptureErrorData( captureData,
                                    Util::hashString( "TODReg" ) );

        chipIdx++;
    }

    return SUCCESS;
}

/**
 * @brief  Clears the TOD error bits in TOD_ERRORREGISTER and the TOD error
 *         bits in TP_LFIR on every chip in the PROC domain.
 * @note   The clean-up code is compiled in only when __HOSTBOOT_RUNTIME is
 *         defined; otherwise the function simply returns SUCCESS.
 * @param  i_stepcode  The step code data struct (not referenced here).
 * @return FAIL if a register access failed on at least one chip, SUCCESS
 *         otherwise.
 */
int32_t todCleanUpErrors( STEP_CODE_DATA_STRUCT & i_stepcode )
{
    #define PRDF_FUNC "[Proc::todCleanUpErrors] "

    uint32_t o_rc = SUCCESS;

#ifdef __HOSTBOOT_RUNTIME
    // TOD Error Register bits 14,15,16,17,21,39
    const uint64_t todErrBits = 0x0003C40001000000ull;

    ProcDomain * procDomain =
        (ProcDomain*)systemPtr->GetDomain( PROC_DOMAIN );

    for ( size_t idx = 0; idx < procDomain->GetSize(); idx++ )
    {
        RuleChip * chip = procDomain->LookUp( idx );

        SCAN_COMM_REGISTER_CLASS * todErrReg =
                    chip->getRegister( "TOD_ERRORREGISTER" );

        if ( SUCCESS != todErrReg->Read() )
        {
            PRDF_ERR( PRDF_FUNC"Read() failed on TOD_ERRORREGISTER: "
                      "proc=0x%08x", chip->GetId() );

            // Remember the failure but keep going with the other chips.
            o_rc = FAIL;
            continue;
        }

        // Bits in this register are cleared by writing 1, so write back
        // exactly those TOD error bits that are currently set.
        const uint64_t pending =
                    todErrReg->GetBitFieldJustified( 0, 64 ) & todErrBits;

        if ( 0 != pending )
        {
            todErrReg->SetBitFieldJustified(  0, 64, pending );

            if ( SUCCESS != todErrReg->Write() )
            {
                PRDF_ERR( PRDF_FUNC"Write() failed on TOD_ERRORREGISTER: "
                              "proc=0x%08x", chip->GetId() );
                o_rc = FAIL;
                continue;
            }
        }

        // Drop the cached copy so that the next read is a forced read.
        RegDataCache::getCachedRegisters().flush( chip, todErrReg );

        // Clear bits 18 and 20 in TPLFIR through its AND register: every
        // bit is set except the two that are to be cleared.
        SCAN_COMM_REGISTER_CLASS * tpFirAnd =
                        chip->getRegister( "TP_LFIR_AND" );

        tpFirAnd->setAllBits();
        tpFirAnd->ClearBit(18);
        tpFirAnd->ClearBit(20);

        if ( SUCCESS != tpFirAnd->Write() )
        {
            PRDF_ERR( PRDF_FUNC"Write() failed on TP_LFIR_AND: "
                      "proc=0x%08x", chip->GetId() );
            o_rc = FAIL;
        }
    }
#endif
    return o_rc;

    #undef PRDF_FUNC
}

/**
 * @brief   Investigates if there is a failover initiated by HW.
 * @param   i_chip              chip reporting TOD errors
 * @param   io_faultData        Tod fault info; updated when a HW failover is
 *                              found
 * @param   o_failoverStatus    failover status; phypSwitchedTopology is read,
 *                              masterPathHwFailOver is set on detection
 */
void checkForHwInitiatedFailover( ExtensibleChip * i_chip,
                                  TodFaultData & io_faultData,
                                  TopologySwitchDetails & o_failoverStatus )
{
    #define PRDF_FUNC   "[Proc::checkForHwInitiatedFailover] "

    // This function detects whether an MDMT chip has switched its master path
    // due to a clock fault. In this case, PRD gets an attention due to a step
    // check error in Master Path 0. The failover modifies bit 12 of the TOD
    // status register. PRD finds that both active and backup topology use the
    // same master path (path 1). When PRD checks for faults on each topology
    // it looks at path 1 for both and finds no faults there. So this function
    // checks for the master path failover case and marks the MDMT chip at
    // fault appropriately.

    // Only a chip that is MDMT on both topologies is examined; slave procs
    // are not considered for this check.
    if ( !io_faultData.isActiveMdmt || !io_faultData.isBackupMdmt )
    {
        return;
    }

    // The MDMT has already been marked faulty on some topology, so it is
    // not in the failover state this function is looking for.
    if ( io_faultData.isMdmtAndFaulty[0] || io_faultData.isMdmtAndFaulty[1] )
    {
        return;
    }

    // Get TOD Error register.
    SCAN_COMM_REGISTER_CLASS * todErrReg =
                i_chip->getRegister("TOD_ERRORREGISTER");

    if ( SUCCESS != todErrReg->Read() )
    {
        PRDF_ERR( PRDF_FUNC"Read() failed on TOD_ERRORREGISTER: "
              "i_chip=0x%08x", i_chip->GetId() );
        return;
    }

    // Bit 14 selects position 0, bit 15 selects position 1. If neither bit
    // is set there is nothing to report.
    const bool pos0Error = todErrReg->IsBitSet(14);
    if ( !pos0Error && !todErrReg->IsBitSet(15) )
    {
        return;
    }
    const uint32_t oscPos = pos0Error ? 0 : 1;

    // We failed to capture a TOD error in master path. This implies
    // a HW path failover has occurred.
    o_failoverStatus.masterPathHwFailOver = true;

    // The fault is booked on entry 1 if PHYP has switched the topology,
    // on entry 0 otherwise.
    const uint32_t topPos = o_failoverStatus.phypSwitchedTopology ? 1 : 0;

    io_faultData.faultDetected[topPos]            = true;
    io_faultData.isMdmtAndFaulty[topPos]          = true;
    io_faultData.activeMasterPathPosition[topPos] = oscPos;

    PRDF_TRAC( PRDF_FUNC "HW Initiated failover: MDMT 0x%08x "
               "faulty, mpath pos: %d", i_chip->GetId(),
               oscPos );

    #undef PRDF_FUNC
}

/**
 * @brief   Analyzes the TOD error of a given proc
 *
 * Reads TP_LFIR, TOD_STATUSREGISTER and TOD_ERRORREGISTER of the chip, fills
 * in a TodFaultData record for both topologies and appends that record to
 * o_faults.
 *
 * @param   i_chip      chip reporting TOD errors
 * @param   o_faults    list of Tod fault info; one record is appended, but
 *                      only if all three register reads succeeded
 * @param   i_stepcode  The step code data struct (not referenced in this
 *                      function)
 * @param   io_failOverStatus   topology  failover status
 * @return  SUCCESS if all register reads succeeded, otherwise the return
 *          code of the Read() call that failed.
 */
int32_t todCollectFaultDataChip(  ExtensibleChip * i_chip,
                                  std::vector<TodFaultData> & o_faults,
                                  STEP_CODE_DATA_STRUCT & i_stepcode,
                                  TopologySwitchDetails & io_failOverStatus )
{
    #define PRDF_FUNC "[Proc::todCollectFaultDataChip] "

    TargetHandle_t l_chipTarget = i_chip->GetChipHandle();
    TodFaultData l_faultData ( l_chipTarget );

    uint32_t l_rc = FAIL;

    // Single-pass do/while: a 'break' at this level abandons the analysis
    // and returns the failing return code without adding a record.
    do
    {
        // Check if PHYP reported TOD error
        SCAN_COMM_REGISTER_CLASS * l_pTpLFir = i_chip->getRegister( "TP_LFIR" );

        l_rc = l_pTpLFir->Read();
        if ( SUCCESS != l_rc )
        {
            PRDF_ERR( PRDF_FUNC"Read() failed on TP_LFIR: i_chip=0x%08x",
                      i_chip->GetId() );
            break;
        }

        l_faultData.phypDetectedFault = l_pTpLFir->IsBitSet(20);

        // Determine active topology.
        SCAN_COMM_REGISTER_CLASS * l_todStatus =
                i_chip->getRegister("TOD_STATUSREGISTER");

        l_rc = l_todStatus->Read();
        if ( SUCCESS != l_rc )
        {
            PRDF_ERR( PRDF_FUNC"Read() failed on TOD_STATUSREGISTER: "
                      "i_chip=0x%08x", i_chip->GetId() );
            break;
        }

        //Reading TOD_STATUSREGISTER[0:2]
        //0b000 means configuration chosen is Primary
        //0b111 means configuration chosen is Secondary
        // Note: any non-zero value of the field is treated as "not primary".

        bool l_activeIsPrimary =
            ( 0 == l_todStatus->GetBitFieldJustified( 0, 3 ) );
        l_faultData.activeTopologyIsPrimary = l_activeIsPrimary;

        // Get TOD Error register.
        SCAN_COMM_REGISTER_CLASS * l_todError =
                        i_chip->getRegister("TOD_ERRORREGISTER");

        l_rc = l_todError->Read();
        if ( SUCCESS != l_rc )
        {
            PRDF_ERR( PRDF_FUNC"Read() failed on TOD_ERRORREGISTER: "
                      "i_chip=0x%08x", i_chip->GetId() );
            break;
        }

        // Check both topologies, active first.
        for ( int i = 0; i < 2; i++ )
        {
            // Each chip has 2 TOD topologies configured (primary and secondary)
            // One of these is selected as active topology and one as backup
            // In TodFaultData, index 0 is used for the active topology, and 1
            // for the backup. We also need to know whether we are looking at
            // the primary or secondary topology, because that will determine
            // the bit positions we use in the TOD registers.
            // So within this for loop, index 0/1 refers to active/backup
            // l_topIsPri identifies whether the current topo was configured
            // in the primary or secondary position.

            bool l_topIsPri =
                    ( ( 0 == i ) ? l_activeIsPrimary : !l_activeIsPrimary );

            bool l_masterTodSelected = false ;
            bool l_masterDrawerSelected = false;

            // Check if MDMT on current topology.
            // Status bits 13/14 are used for the primary configuration and
            // bits 17/18 for the secondary configuration.
            l_masterTodSelected =
                        l_todStatus->IsBitSet( l_topIsPri ? 13 : 17 );
            l_masterDrawerSelected =
                        l_todStatus->IsBitSet( l_topIsPri ? 14 : 18 );

            // Check master OSC status if MDMT
            if ( ( l_masterTodSelected ) && ( l_masterDrawerSelected ) )
            {
                // Determine which OSC card is used.
                bool l_osc0;    //means  master path 0
                bool l_oscFail;

                // These two flags do not depend on the loop index; they are
                // taken from status bits 23 and 24 whenever the chip is
                // found to be MDMT on the topology being examined.
                l_faultData.isActiveMdmt = l_todStatus->IsBitSet(23);
                l_faultData.isBackupMdmt = l_todStatus->IsBitSet(24);

                // Status bit 12 (primary) / 16 (secondary) clear means
                // master path 0 is in use, set means master path 1.
                l_osc0 = !l_todStatus->IsBitSet( l_topIsPri ? 12 : 16 );
                l_faultData.activeMasterPathPosition[i] = l_osc0 ? 0 : 1;

                // Read step check error bit in TOD error register
                // (bit 14 for master path 0, bit 15 for master path 1)
                l_oscFail = l_todError->IsBitSet( l_osc0 ? 14 : 15 );

                if ( l_oscFail )
                {
                    // Set fault data.
                    l_faultData.faultDetected[i] = true;
                    l_faultData.isMdmtAndFaulty[i] = true;

                    PRDF_TRAC(PRDF_FUNC " MDMT: 0x%08x at Error, M-Path: %d, "
                              "topology: %c",
                              i_chip->GetId(), l_osc0 ? 0 : 1,
                              i == 0 ?'A':'B' );
                }

            }//if mdmt

            else // Is not MDMT on this topology.
            {
                // Determine whether slave chip is using Primary configuration
                // slave path (slave path 0 )or secondary configuration slave
                //path (slave path 1 )
                bool l_slv0 = !l_todStatus->IsBitSet( l_topIsPri ? 15 : 19 );

                // Check if TOD slave path has any step check error.
                // bit 16 and 21  of TOD_ERRORREGISTER indicate if there is any
                // TOD Error in slave path.

                bool l_slvErr = l_todError->IsBitSet( l_slv0 ? 16 : 21 );

                // If there is Step Check Error, we must determine proc sourcing
                // clock to  the chip reporting step check error. We do this by
                // reading PCRP0 for primary configuration and SCRP1 for
                // secondary configuration to determine which bus is being used
                // to transmit tod clock. We can use that to get the peer proc
                // at the other end of the bus.

                if ( l_slvErr )
                {
                    uint32_t l_connection = 0;
                    TargetHandle_t l_procClockSrc = NULL;

                    // The port control register is read only in a
                    // __HOSTBOOT_RUNTIME build. In any other build l_ret
                    // stays FAIL, so the 'continue' below is always taken
                    // and this slave path error is not recorded.
                    uint32_t l_ret = FAIL;
#ifdef __HOSTBOOT_RUNTIME
                    l_ret = getTodPortControlReg( l_chipTarget, l_slv0,
                                                  l_connection );
#endif
                    // 'continue' moves on to the next topology of the
                    // enclosing for loop.
                    if( SUCCESS != l_ret ) continue;

                    // The connection value is in bits 0:2. The scomdef doesn't
                    // define this very well:
                    //    X0_PORT_0=>0b000
                    //    X1_PORT_0=>0b001
                    //    X2_PORT_0=>0b010
                    //    X3_PORT_0=>0b011
                    //    X4_PORT_0=>0b100
                    //    X5_PORT_0=>0b101
                    //    X6_PORT_0=>0b110
                    //    X7_PORT_0=>0b111
                    // I've been told the actual definition is 0-2 for XBUS0-2
                    // 3-6 for OBUS0-3, port 7 unused.

                    // Keep only the three most significant bits of the
                    // 32-bit value.
                    l_connection >>= 29;
                    if ( l_connection > 6 )
                    {
                        PRDF_ERR( PRDF_FUNC"Configuration error for 0x%08x "
                                  "connection 0x%08x", getHuid(l_chipTarget),
                                  l_connection );
                        continue;
                    }
                    else
                    {
                        // Values 0-2 are used as the XBUS position as-is;
                        // values 3-6 are rebased to OBUS positions 0-3.
                        TYPE l_busType = TYPE_XBUS;
                        if ( l_connection > 2 )
                        {
                            l_busType = TYPE_OBUS;
                            l_connection -= 3;
                        }

                        l_procClockSrc = getConnectedPeerProc( l_chipTarget,
                                                              l_busType,
                                                              l_connection );
                    }

                    // No peer proc found: fall back to this chip itself as
                    // the clock source.
                    if( NULL == l_procClockSrc )
                    {
                        l_procClockSrc = l_chipTarget;
                    }

                    // Set fault data.
                    l_faultData.faultDetected[i] = true;
                    l_faultData.chipSourcingClk[i] = l_procClockSrc;

                    PRDF_TRAC( PRDF_FUNC " Slave 0x%08x at Error S-Path %d,"
                               "topology %c,  clk source is 0x%08x",
                               i_chip->GetId(), l_slv0 ? 0:1,
                               i == 0 ? 'A':'B',
                               getHuid( l_procClockSrc ) );

                } // error in slave
            }//else not mdmt
        }//for topology

        // May mark the MDMT as faulty if a HW master path failover is found.
        checkForHwInitiatedFailover( i_chip, l_faultData, io_failOverStatus );

        // Check for an internal path error in active topology
        // (TOD_ERRORREGISTER bit 17). It is recorded only if no other fault
        // was found for that entry; the chip itself is set as clock source.
        uint32_t topPos = io_failOverStatus.phypSwitchedTopology ? 1 : 0;
        if ( !l_faultData.faultDetected[topPos]  && l_todError->IsBitSet(17) )
        {
            l_faultData.faultDetected[topPos] = true;
            l_faultData.chipSourcingClk[topPos] = l_chipTarget;
        }

        // The record is added whether or not a fault was detected.
        o_faults.push_back( l_faultData );

        l_rc = SUCCESS;

    } while(0);

    return l_rc;

    #undef PRDF_FUNC
}

/**
 * @brief   Collects TOD fault error info for all procs in the system
 *
 * Calls todCollectFaultDataChip() for every chip in the PROC domain. A
 * failure on one chip is traced and does not stop the remaining chips from
 * being analyzed.
 *
 * @param   o_faults    list of Tod fault info, filled in chip by chip
 * @param   i_stepcode  The step code data struct
 * @param   io_FailoverStatus   hw initiated failover status
 */
void todCollectFaultDataSys( std::vector<TodFaultData> & o_faults,
                             STEP_CODE_DATA_STRUCT & i_stepcode,
                             TopologySwitchDetails & io_FailoverStatus )
{
    ProcDomain * l_procDomain =
        (ProcDomain*)systemPtr->GetDomain( PROC_DOMAIN );

    for( size_t i = 0; i < l_procDomain->GetSize(); i++ )
    {
        RuleChip * l_chip = l_procDomain->LookUp( i );

        // Same type as the callee's return value.
        int32_t l_rc = todCollectFaultDataChip( l_chip, o_faults,
                                                i_stepcode,
                                                io_FailoverStatus );
        if( SUCCESS != l_rc )
        {
            // The adjacent literals are concatenated, so the first one must
            // end with a space to keep "in" and "chip" apart.
            PRDF_ERR("[todCollectFaultDataSys] Failed to analyze tod errors in "
                     "chip 0x%08x",l_chip->GetId() );
        }

    }
}

/**
 * @brief   Determines if Phyp switched the topology.
 * @return  o_topologySwitch   topology switch status
 */
bool  checkPhypSwitchedTopology(  )
{
    #define PRDF_FUNC "[checkPhypSwitchedTopology] "

    bool o_topologySwitch = false;

    ProcDomain * l_procDomain =
        (ProcDomain*)systemPtr->GetDomain( PROC_DOMAIN );

    for( size_t i = 0; i < l_procDomain->GetSize(); i++ )
    {
        RuleChip * l_chip = l_procDomain->LookUp( i );
        // Get TOD Error register.
        SCAN_COMM_REGISTER_CLASS * l_todError =
                        l_chip->getRegister("TOD_ERRORREGISTER");

        if( SUCCESS != l_todError->Read() )
        {
            PRDF_ERR( PRDF_FUNC"Read  failed for tod  error "
                     "register on 0x%08x", l_chip->GetId() );
            break;
        }

        o_topologySwitch = l_todError->IsBitSet(39);

        if( true == o_topologySwitch )
        {
            break;
        }
    }

    return o_topologySwitch;
    #undef PRDF_FUNC
}

/**
 * @brief Collects FFDC associated with step errors.
 * @param io_todErrorData    contains fault status and data for all chips.
 * @param i_failOverstatus  contains master path and topology failover data.
 * @param o_errorSummary    contains FFDC associated with step errors.
 */
void collectTodErrorFfdc(   std::vector<TodFaultData> & io_todErrorData,
                            TopologySwitchDetails i_failOverstatus,
                            TodErrorSummary & o_errorSummary )
{
    std::vector<TodFaultData> faultyChip;
    memset( &o_errorSummary, 0x00, sizeof(TodErrorSummary) );

    for ( auto & i : io_todErrorData )
    {
        if ( i.phypDetectedFault )
        {
            o_errorSummary.phypDetectedTodError = 1;
        }

        if( i.isActiveMdmt )
        {
            o_errorSummary.activeMdmt = getHuid( i.chipReportingError );
            o_errorSummary.activeTopology =
                                        i.activeTopologyIsPrimary ? 1 : 0;
            // master path position selected for active MDMT
            o_errorSummary.activeTopologyMastPath =
                                        i.activeMasterPathPosition[0];
        }

        if( i.isBackupMdmt )
        {
            o_errorSummary.backUpMdmt = getHuid( i.chipReportingError );
            // master path position selected for backup MDMT
            o_errorSummary.backUpTopologyMastPath =
                                            i.activeMasterPathPosition[1];
        }

        // Add to list if some error is detected.
        if ( i.phypDetectedFault || i.faultDetected[0] ||
             i.faultDetected[1] )
        {
            faultyChip.push_back( i );
        }
    }
    o_errorSummary.topologySwitchByPhyp =
                    i_failOverstatus.phypSwitchedTopology ? 1 :0 ;

    o_errorSummary.hardwareSwitchFlip =
                    i_failOverstatus.masterPathHwFailOver ? 1 : 0;
    o_errorSummary.reserved = 0;

    io_todErrorData.empty();
    io_todErrorData = faultyChip;
}

/**
 * @brief Adds FFDC associated with step error as Capture data.
 *
 * The summary is copied into a zero-padded buffer made of whole CPU_WORDs
 * and added to the capture data of the step code under the id
 * "TOD_ERROR_DATA". On a little endian build every word is passed through
 * htonl() first.
 *
 * @param i_chip            Chip reporting TOD step error.
 * @param i_stepcode        Step Code Data Struct.
 * @param i_errorSummary    contains FFDC associated with step error.
 */
void addFfdcToCaptureData(  ExtensibleChip * i_chip,
                            STEP_CODE_DATA_STRUCT & i_stepcode,
                            TodErrorSummary & i_errorSummary )
{
    // Number of CPU_WORDs needed to hold a TodErrorSummary, rounded up.
    // Both values are compile-time constants, so the buffer below is an
    // ordinary fixed-size array rather than a variable-length array.
    const size_t sz_w    = sizeof(CPU_WORD);
    const size_t wordCnt = ( sizeof(TodErrorSummary) + sz_w - 1 ) / sz_w;

    // The buffer is declared with the word type it is accessed as, so no
    // pointer cast is needed and its alignment suits CPU_WORD.
    CPU_WORD errorDataBuff[wordCnt];
    memset( errorDataBuff, 0x00, sizeof(errorDataBuff) );
    memcpy( errorDataBuff, &i_errorSummary, sizeof(TodErrorSummary) );

    #if( __BYTE_ORDER == __LITTLE_ENDIAN )

    for( size_t i = 0; i < wordCnt; i++ )
    {
        errorDataBuff[i] = htonl( errorDataBuff[i] );
    }

    #endif

    BitString  bs( sizeof(errorDataBuff) * 8, errorDataBuff );

    CaptureData & cd = i_stepcode.service_data->GetCaptureData();
    cd.Add( i_chip->GetChipHandle(), Util::hashString("TOD_ERROR_DATA"), bs );
}

/**
 * @brief   Analyzes the step check error of all procs in the system
 * @param   i_chip      chip reporting TOD errors
 * @param   i_stepcode  The step code data struct
 * @return  SUCCESS.
 */
int32_t todStepCheckFault( ExtensibleChip * i_chip,
                           STEP_CODE_DATA_STRUCT & i_stepcode )
{
    #define PRDF_FUNC "[Proc::todStepCheckFault] "

    // When we analyze a step check fault, we will look at all chips in the
    // system--both topologies. After we've collected TOD fault data on each
    // chip, we will categorize the failure as:
    //   - MDMT Clock problem
    //   - Internal path error
    //   - Connection error between chips
    // In case of connection error,we try to minimize the list of chips to the
    // list of most probable chips causing TOD errors. Once all the chips at
    // fault are isolated, hwsv is requested to create a new back up topology.

    // Collect TOD registers for FFDC.
    todCaptureRegisters( i_stepcode );

    // Collect TOD fault data.
    std::vector<TodFaultData> l_faultData;

    // List of chips for HWSV to avoid when constructing a new backup topo
    std::vector< TargetHandle_t > l_chipBlackList;

    // Osc for HWSV to avoid when constructing a new backup topology
    // Since HB doesn't model osc targets, we need a proc and Osc position
    TargetHandle_t procOscTgtBl = nullptr; // Proc target assoc with bad Osc
    uint32_t oscPosBl = 0xFFFFFFFF; // Osc position relative to proc

    TopologySwitchDetails failOverstatus;
    failOverstatus.phypSwitchedTopology = checkPhypSwitchedTopology( );
    todCollectFaultDataSys( l_faultData, i_stepcode, failOverstatus );
    TodErrorSummary todErrorFfdc;
    collectTodErrorFfdc( l_faultData, failOverstatus, todErrorFfdc );

    bool l_phypError = false;
    TargetHandle_t mdmtList[2] = {NULL, NULL };
    uint8_t mdmtFailedOscPos[2] = {0xFF, 0xFF};
    uint8_t analysisSummary[2] = { NO_TOD_ERROR, NO_TOD_ERROR };
    bool l_allInternal = true;
    bool l_foundFault = false;

    // Find MDMT chips at fault
    for ( std::vector<TodFaultData>::iterator i = l_faultData.begin();
          i != l_faultData.end(); i++ )
    {
        if ( i->phypDetectedFault )
        {
            l_phypError = true;
        }

        for ( int t = 0; t < 2; t++ )
        {
            if( i->isMdmtAndFaulty[t] )
            {
                mdmtList[t] = i->chipReportingError;
                mdmtFailedOscPos[t] = i->activeMasterPathPosition[t];
            }
        }
    }

    if ( l_phypError )
    {
        i_stepcode.service_data->SetThresholdMaskId(0);
    }

    // Look at both topologies.
    for ( int i = 0; i < 2; i++ )
    {
        // Classifications of topology errors:
        // 1) MDMT clock problem - callout clock or MDMT.
        // 2) Internals only - callout chips.
        // 3) Network error - clear internals, and isolate.

        // MDMT analysis

        if( NULL != mdmtList[i] )
        {
            // HW initiated failover. Callout the failed OSC.
            if ( failOverstatus.masterPathHwFailOver )
            {
                i_stepcode.service_data->SetThresholdMaskId(0);
            }
            // Add Osc to blacklist
            procOscTgtBl = mdmtList[i];
            oscPosBl = mdmtFailedOscPos[i];

            // Add Proc to blacklist
            l_chipBlackList.push_back( mdmtList[i] );

            // Callout and gard TOD OSC
#ifdef __HOSTBOOT_MODULE
            errlHndl_t errl =
                ServiceGeneratorClass::ThisServiceGenerator().getErrl();
            if ( NULL == errl )
            {
                PRDF_ERR( PRDF_FUNC "Failed to get the global error log" );
                break;
            }
            errl->addClockCallout( mdmtList[i], HWAS::TODCLK_TYPE,
                                   HWAS::SRCI_PRIORITY_HIGH,
                                   HWAS::DECONFIG,
                                   HWAS::GARD_Predictive );
#else
            TargetHandle_t l_clockTarget = nullptr;
            l_clockTarget = getConnectedChild(  procOscTgtBl,
                                                TYPE_TODCLK,
                                                oscPosBl );
            if (l_clockTarget)
                i_stepcode.service_data->SetCallout( l_clockTarget, MRU_HIGH );
#endif
            // Callout MDMT chip
            i_stepcode.service_data->SetCallout(mdmtList[i], MRU_MEDA );

            //callout a symbolic FRU to replace FRU/interfaces between Proc and
            //TOD OSC card
            i_stepcode.service_data->SetCallout( TOD_CLOCK_ERR, MRU_MED,
                                                 NO_GARD );
            analysisSummary[i] = MASTER_PATH_ERROR;

            // We have analyzed this topology to an MDMT fault, move on to the
            // backup topology
            continue;
        }

        // Collect some information for further classification
        for ( std::vector<TodFaultData>::iterator j = l_faultData.begin();
                j != l_faultData.end(); j++ )
        {
            // If fault on topology.
            if ( j->faultDetected[i] )
            {
                l_foundFault = true;

                // Check if non-internal fault.
                if( j->chipSourcingClk[i] != j->chipReportingError )
                {
                    // ignore internal path errors during hw failover.
                    l_allInternal = false;
                }
            }
        }

        // Skip analysis if this topology has nothing.
        if ( !l_foundFault )
        {
            continue;
        }

        if ( l_allInternal ) // Internal callouts.
        {

            for ( std::vector<TodFaultData>::iterator j = l_faultData.begin();
                    j != l_faultData.end(); j++ )
            {
                if ( j->chipSourcingClk[i] == j->chipReportingError )
                {

                    if ( NULL != j->chipReportingError )
                    {
                        // update consolidated callout list and
                        //black list for internal path errors
                        i_stepcode.service_data->SetCallout(
                                        j->chipReportingError,MRU_MED );
                        l_chipBlackList.push_back( j->chipReportingError );
                    }
                }
            }

            analysisSummary[i] = INTERNAL_PATH_ERROR;
        }
        else // Network callout.
        {
            // Clear all internal reports and get chips.
            for ( std::vector<TodFaultData>::iterator j = l_faultData.begin();
                    j != l_faultData.end(); j++ )
            {
                if ( j->chipSourcingClk[i] == j->chipReportingError )
                {
                    j->faultDetected[i] = false;
                }
            }

            TargetHandleList l_rootList;
            std::vector<TodFaultData>::iterator itSrc;

            for( itSrc = l_faultData.begin(); itSrc != l_faultData.end();
                 itSrc++ )
            {
                std::vector<TodFaultData>::iterator itReport;
                bool l_badSrc = false;

                if( !itSrc->faultDetected[i] )
                    continue;

                for( itReport = l_faultData.begin();
                     itReport != l_faultData.end();
                     itReport++ )
                {
                    // If proc A is getting its tod clock from proc B and both
                    // are reporting step check errors, we callout only B.
                    if(  itSrc->chipSourcingClk[i] ==
                         itReport->chipReportingError )
                    {
                        if ( true == itReport->faultDetected[i] )
                        {
                            l_badSrc = true;
                            l_rootList.push_back(itReport->chipReportingError);

                            PRDF_TRAC( PRDF_FUNC "Network callout adding clk"
                                       "source chip 0x%08x topology %c",
                                       getHuid(itReport->chipReportingError ),
                                       i == 0 ? 'A':'B' );
                        }
                        break;
                    }
                }

                if( !l_badSrc )
                {
                    l_rootList.push_back( itSrc->chipReportingError );
                    PRDF_TRAC( PRDF_FUNC "Network callout adding chip 0x%08x "
                               "i = %c", getHuid( itSrc->chipReportingError ),
                                i == 0 ? 'A':'B' );
                }
            }

            // Sort, remove unique.
            std::sort( l_rootList.begin(), l_rootList.end() );
            std::vector<TargetHandle_t>::iterator itChip;
            itChip = std::unique(l_rootList.begin(), l_rootList.end());
            l_rootList.erase( itChip,l_rootList.end() );

            //Calling out the final list of chips reporting connection
            //problem in  TOD network.
            for ( auto &failedChip : l_rootList )
            {
                // update the consolidated callout list and
                // black list for hwsv
                i_stepcode.service_data->SetCallout( failedChip, MRU_MED );
                l_chipBlackList.push_back( failedChip );
            } //for l_rootList

            analysisSummary[i] = SLAVE_PATH_NETWORK_ERROR;

        }// else network error

    }//for topology

    std::sort( l_chipBlackList.begin(), l_chipBlackList.end() );
    std::vector<TargetHandle_t>::iterator itBlackList;
    itBlackList = std::unique( l_chipBlackList.begin(), l_chipBlackList.end());
    l_chipBlackList.erase( itBlackList, l_chipBlackList.end() );

    // Now we call HWSV to create a new backup topology. The chips in the black
    // list will not be selected as the new MDMT.
#ifdef __HOSTBOOT_RUNTIME
    todErrorFfdc.topologyResetRequested = 0;
    if ( i_stepcode.service_data->IsAtThreshold() )
    {
        requestNewTODTopology( oscPosBl, procOscTgtBl,
                               l_chipBlackList, !l_phypError );
        todErrorFfdc.topologyResetRequested = 1;
    }
#endif

    // If we never made a callout, call out this chip.
    if ( 0 == i_stepcode.service_data->getMruListSize() )
    {
        i_stepcode.service_data->SetCallout( i_chip->GetChipHandle() );
        analysisSummary[0] = UNKNOWN_TOD_ERROR;
        analysisSummary[1] = UNKNOWN_TOD_ERROR;
    }

    // Clean up all TOD error reports.
    if ( SUCCESS != todCleanUpErrors( i_stepcode ) )
    {
        PRDF_ERR(PRDF_FUNC "Failed to clear TOD Errors of the"
                 "System" );
    }

    for( auto &blChip : l_chipBlackList )
    {
        PRDF_TRAC( PRDF_FUNC"black list chip HUID: 0x%08x ",
                   getHuid( blChip ) );
    }

    if (procOscTgtBl)
    {
        PRDF_TRAC( PRDF_FUNC "black list osc chip HUID 0x%08x Pos %d",
                   getHuid(procOscTgtBl), oscPosBl );
    }

    // At last, add FFDC as capture data to error log
    todErrorFfdc.activeTopologySummary = analysisSummary[0];
    todErrorFfdc.backUpTopologySummary = analysisSummary[1];
    addFfdcToCaptureData( i_chip, i_stepcode, todErrorFfdc );

    return SUCCESS;

    #undef PRDF_FUNC
}
// Expose todStepCheckFault through PRDF_PLUGIN_DEFINE_NS for the p9_nimbus
// and p9_cumulus namespaces.
// NOTE(review): the macro is defined outside this file section; presumably it
// registers the function as a rule plugin for each chip model - confirm.
PRDF_PLUGIN_DEFINE_NS( p9_nimbus,  Proc, todStepCheckFault );
PRDF_PLUGIN_DEFINE_NS( p9_cumulus, Proc, todStepCheckFault );

/**
 * @brief   Request for creation of a new back up topology.
 *
 * Reads TOD_STATUSREGISTER of the given chip, selects one topology's pair of
 * role bits (13/14 or 17/18) based on the field returned by
 * GetBitFieldJustified( 0,3 ), and, when both the master-TOD and the
 * master-drawer bits of that pair are set (i.e. the chip is the MDMT there),
 * calls requestNewTODTopology() with this chip as the only entry of the bad
 * chip list. All of the work is compiled in only for __HOSTBOOT_RUNTIME.
 *
 * @param   i_chip      chip reporting TOD errors
 * @param   i_stepcode  The step code data struct (not referenced here)
 * @return  SUCCESS, always. A failure to read the status register is traced
 *          and no topology request is made.
 */
int32_t todNewTopologyIfBackupMDMT( ExtensibleChip * i_chip,
                                    STEP_CODE_DATA_STRUCT & i_stepcode )
{
#ifdef __HOSTBOOT_RUNTIME
    do
    {
        SCAN_COMM_REGISTER_CLASS * l_todStatus =
                        i_chip->getRegister( "TOD_STATUSREGISTER" );

        if( SUCCESS != l_todStatus->Read( ) )
        {
            // Adjacent string literals are concatenated verbatim, so the
            // separating blank must be part of the first literal. The address
            // is zero padded so it reads as one 64-bit hex value.
            PRDF_ERR("[todNewTopologyIfBackupMDMT] Failed to read TOD status "
                     "register, address 0x%016llx of proc 0x%08x ",
                     l_todStatus->GetAddress(),i_chip->GetId() );
            break;
        }

        // True when the 3-bit field at the start of the register is non-zero.
        // NOTE(review): the function name suggests the role bits inspected
        // below are meant to be those of the backup topology; confirm the
        // polarity of this field against the TOD status register definition.
        bool primaryIsActive = !( 0 == l_todStatus->GetBitFieldJustified( 0,3 ) );

        /* Check this chips role
         * Topology - 1
         *
         * TOD_STATUS[13]   TOD_STATUS[14]          Inference
         *      1               1                   Master TOD Master Drawer
         *      0               1                   Slave TOD Master Drawer
         *      0               0                   Slave TOD Slave Drawer
         *      1               0                   Master TOD Slave Drawer

         * Topology - 2
         * TOD_STATUS[17]   TOD_STATUS[18]  Inference
         *
         *   Truth Table is same as above
         */

        // The two topologies keep their role bits 4 positions apart
        // (13/14 versus 17/18), so compute the shift once.
        const int l_roleBitOffset = primaryIsActive ? 0 : 4;

        // Check for MDMT status.
        const bool l_masterTodSelect =
                        l_todStatus->IsBitSet( 13 + l_roleBitOffset );
        const bool l_masterDrawerSelect =
                        l_todStatus->IsBitSet( 14 + l_roleBitOffset );

        // If this is the MDMT then request a new topology.
        if( l_masterTodSelect && l_masterDrawerSelect )
        {
            // No oscillator is black listed here (0xFFFFFFFF / nullptr); only
            // this chip is handed over as a bad chip.
            TargetHandleList badChipList;
            badChipList.push_back( i_chip->GetChipHandle() );
            requestNewTODTopology( 0xFFFFFFFF, nullptr, badChipList, false );
        }

    } while(0);
#endif
    return SUCCESS;
}
PRDF_PLUGIN_DEFINE_NS( p9_nimbus,  Proc, todNewTopologyIfBackupMDMT );
PRDF_PLUGIN_DEFINE_NS( p9_cumulus, Proc, todNewTopologyIfBackupMDMT );


/**
 * @brief   Requests a topology switch in response to a logic parity error.
 *
 * Under __HOSTBOOT_RUNTIME, and only once the service data reports that the
 * threshold has been reached, asks for a new TOD topology with the reporting
 * chip as the single entry of the bad chip list. In every other case this is
 * a no-op.
 *
 * @param   i_chip      chip reporting TOD logic parity error.
 * @param   i_stepcode  The step code data struct
 * @return  SUCCESS.
 */
int32_t requestTopologySwitch( ExtensibleChip * i_chip,
                               STEP_CODE_DATA_STRUCT & i_stepcode )
{
#ifdef __HOSTBOOT_RUNTIME
    // Nothing to request until the error has reached threshold.
    if ( !i_stepcode.service_data->IsAtThreshold() )
    {
        return SUCCESS;
    }

    // Reconfigure the TOD topology and let PHYP know when backup is good.
    // No oscillator is singled out (0xFFFFFFFF / nullptr); only this chip is
    // reported as bad.
    TargetHandleList l_badChips;
    l_badChips.push_back( i_chip->GetChipHandle( ) );
    requestNewTODTopology( 0xFFFFFFFF, nullptr, l_badChips, true );
#endif
    return SUCCESS;
}
PRDF_PLUGIN_DEFINE_NS( p9_nimbus,  Proc, requestTopologySwitch );
PRDF_PLUGIN_DEFINE_NS( p9_cumulus, Proc, requestTopologySwitch );

/**
 * @brief   Checks if TOD error analysis is disabled on platform.
 *
 * Three cases are distinguished:
 *  - OPAL configuration: the AT_THRESHOLD flag is set and the service call is
 *    cleared, then analysis is reported as disabled.
 *  - Hypervisor running, PHYP configuration, and neither isMfgAvpEnabled()
 *    nor isMfgHdatAvpEnabled() is true: analysis is reported as supported.
 *  - Anything else: LEVEL2_SUPPORT and SP_CODE are called out and analysis is
 *    reported as disabled.
 *
 * @param   i_chip      chip reporting TOD error (not referenced here).
 * @param   i_stepcode  The step code data struct.
 * @return  SUCCESS if TOD analysis is disabled, FAIL if TOD fault analysis is
 *          supported.
 */
int32_t isTodDisabled( ExtensibleChip * i_chip,
                       STEP_CODE_DATA_STRUCT & i_stepcode )
{
    if ( isHyprConfigOpal() )
    {
        // On OPAL machine, mask TOD errors on first instance. There
        // should not be any service action.
        i_stepcode.service_data->setFlag( ServiceDataCollector::AT_THRESHOLD );
        i_stepcode.service_data->clearServiceCall();
        return SUCCESS; // TOD fault analysis not supported
    }

    // Evaluated left to right with short-circuiting, exactly one check at a
    // time.
    const bool l_analysisSupported = isHyprRunning() && isHyprConfigPhyp() &&
                                     !isMfgAvpEnabled() &&
                                     !isMfgHdatAvpEnabled();
    if ( l_analysisSupported )
    {
        return FAIL; // TOD Fault analysis is supported
    }

    // Unsupported configuration: leave it to support and SP code callouts.
    i_stepcode.service_data->SetCallout( LEVEL2_SUPPORT, MRU_MED, NO_GARD );
    i_stepcode.service_data->SetCallout( SP_CODE, MRU_MED, NO_GARD );
    return SUCCESS; // TOD fault analysis not supported
}
PRDF_PLUGIN_DEFINE_NS( p9_nimbus,  Proc, isTodDisabled );
PRDF_PLUGIN_DEFINE_NS( p9_cumulus, Proc, isTodDisabled );

} //namespace Proc ends

} //namespace PRDF ends
OpenPOWER on IntegriCloud