summaryrefslogtreecommitdiffstats
path: root/doc/error-logging.txt
blob: a29d36842f20d94c5c842f4ca8266ad357499c39 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
How to log errors on Sapphire and POWERNV:
=========================================

Currently the errors reported by POWERNV/Sapphire (OPAL) interfaces
are in free form, whereas errors reported by FSP are in the standard Platform
Error Log (PEL) format. For out-of-band management via IPMI interfaces,
it is necessary to push the errors reported by POWERNV/Sapphire down to
the FSP via the mailbox, in PEL format.

PEL size can vary from 2K-16K bytes, the fields of which need to be populated
based on the kind of event and error that needs to be reported.
All the information that needs to be reported as part of the error is
passed by the user using the error-logging interfaces outlined below.
Following which, a PEL structure is generated based on the input and
then passed on to FSP.

Error logging interfaces in Sapphire:
====================================

Interfaces are provided for the user to log/report an error in Sapphire.
Using these interfaces relevant error information is collected and later
converted to PEL format and then pushed to FSP.

Step 1: To report an error, invoke opal_elog_create() with required argument.

	struct opal_errorlog *opal_elog_create(int reason_code);

	Each error/event that needs to be reported should do so with its
	unique 32 bit reason code/SRC. Based on this SRC, relevant information
	around that error/event is gathered from look-up table and updated
	into the error log buffer.

	Parameters:

	 int reason_code: Reason for failure as stated in include/fsp-elog.h
				for Sapphire
			Eg: Reason code for code-update failures can be
				OPAL_RC_CU_INIT  -> Initialisation failure
				OPAL_RC_CU_FLASH -> Flash failure

	Following info is gathered from the look-up table in fsp-elog_write.c
	and is pre-defined for a given error.

	uint8_t opal_error_event_type: Classification of error/events
					type reported on OPAL
		/* Platform Events/Errors: Report Machine Check Interrupt */
		#define OPAL_PLATFORM_ERR_EVT           0x01
		/* INPUT_OUTPUT: Report all I/O related events/errors */
		#define OPAL_INPUT_OUTPUT_ERR_EVT       0x02
		/* RESOURCE_DEALLOC: Hotplug events and errors */
		#define OPAL_RESOURCE_DEALLOC_ERR_EVT   0x03
		/* MISC: Miscellaneous error */
		#define OPAL_MISC_ERR_EVT               0x04

	uint16_t component_id: Component ID of Sapphire component as
				listed in include/fsp-elog.h

	uint8_t subsystem_id: ID of the sub-system reporting error.
		/* OPAL Subsystem IDs listed for reporting events/errors */
			#define OPAL_PROCESSOR_SUBSYSTEM        0x10
			#define OPAL_MEMORY_SUBSYSTEM           0x20
			#define OPAL_IO_SUBSYSTEM               0x30
			#define OPAL_IO_DEVICES                 0x40
			#define OPAL_CEC_HARDWARE               0x50
			#define OPAL_POWER_COOLING              0x60
			#define OPAL_MISC                       0x70
			#define OPAL_SURVEILLANCE_ERR           0x7A
			#define OPAL_PLATFORM_FIRMWARE          0x80
			#define OPAL_SOFTWARE                   0x90
			#define OPAL_EXTERNAL_ENV               0xA0

	uint8_t event_severity: Severity of the event/error to be reported
		#define OPAL_INFO                                   0x00
		#define OPAL_RECOVERED_ERR_GENERAL                  0x10

		/* 0x2X series is to denote set of Predictive Error */
		/* 0x20 Generic predictive error */
		#define OPAL_PREDICTIVE_ERR_GENERAL                         0x20
		/* 0x21 Predictive error, degraded performance */
		#define OPAL_PREDICTIVE_ERR_DEGRADED_PERF                   0x21
		/* 0x22 Predictive error, fault may be corrected after reboot */
		#define OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT            0x22
		/*
		 * 0x23 Predictive error, fault may be corrected after reboot,
		 * degraded performance
		 */
		#define OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_BOOT_DEGRADE_PERF 0x23
		/* 0x24 Predictive error, loss of redundancy */
		#define OPAL_PREDICTIVE_ERR_LOSS_OF_REDUNDANCY              0x24

		/* 0x4X series for Unrecoverable Error */
		/* 0x40 Generic Unrecoverable error */
		#define OPAL_UNRECOVERABLE_ERR_GENERAL                      0x40
		/* 0x41 Unrecoverable error bypassed with degraded performance */
		#define OPAL_UNRECOVERABLE_ERR_DEGRADE_PERF                 0x41
		/* 0x44 Unrecoverable error bypassed with loss of redundancy */
		#define OPAL_UNRECOVERABLE_ERR_LOSS_REDUNDANCY              0x44
		/* 0x45 Unrecoverable error bypassed with loss of redundancy and performance */
		#define OPAL_UNRECOVERABLE_ERR_LOSS_REDUNDANCY_PERF         0x45
		/* 0x48 Unrecoverable error bypassed with loss of function */
		#define OPAL_UNRECOVERABLE_ERR_LOSS_OF_FUNCTION             0x48

		#define OPAL_ERROR_PANIC				    0x50

	uint8_t  event_subtype: Event Sub-type
			#define OPAL_NA                                         0x00
			#define OPAL_MISCELLANEOUS_INFO_ONLY                    0x01
			#define OPAL_PREV_REPORTED_ERR_RECTIFIED                0x10
			#define OPAL_SYS_RESOURCES_DECONFIG_BY_USER             0x20
			#define OPAL_SYS_RESOURCE_DECONFIG_PRIOR_ERR            0x21
			#define OPAL_RESOURCE_DEALLOC_EVENT_NOTIFY              0x22
			#define OPAL_CONCURRENT_MAINTENANCE_EVENT               0x40
			#define OPAL_CAPACITY_UPGRADE_EVENT                     0x60
			#define OPAL_RESOURCE_SPARING_EVENT                     0x70
			#define OPAL_DYNAMIC_RECONFIG_EVENT                     0x80
			#define OPAL_NORMAL_SYS_PLATFORM_SHUTDOWN               0xD0
			#define OPAL_ABNORMAL_POWER_OFF                         0xE0

	uint8_t opal_srctype: SRC type, value should be OPAL_SRC_TYPE_ERROR.
			SRC refers to System Reference Code.
			It is a 4-byte hexadecimal number that reflects the
			current system state.
			Eg: BB821010,
				1st byte -> BB -> SRC Type
				2nd byte -> 82 -> Subsystem
				3rd, 4th byte -> Component ID and Reason Code
			SRC needs to be generated on the fly depending on the state
			of the system. All the parameters needed to generate a SRC
			should be provided during reporting of an event/error.


	 uint32_t reason_code: Reason for failure as stated in include/fsp-elog.h
				for Sapphire
			Eg: Reason code for code-update failures can be
				OPAL_RC_CU_INIT  -> Initialisation failure
				OPAL_RC_CU_FLASH -> Flash failure


Step 2: Multiple extended user dumps can be appended to the error log
	using the below interface.

	int opal_elog_update_user_dump(struct opal_errorlog *buf, unsigned char *data,
                                               uint32_t tag, uint16_t size)

	Parameters:
	struct opal_errorlog *buf: struct opal_errorlog pointer returned
		by the opal_elog_create() call.

	unsigned char *data: Pointer to the dump data

	uint32_t tag: Unique value to identify the data.
		      Ideal to have ASCII value for 4-byte string.

	uint16_t size: Size of the dump data.

Step 3: Once all the data for an error is logged in, the error needs to be
	committed to FSP.

	rc = elog_fsp_commit(buf);
	Value of 0 is returned on success.

In the process of committing an error to FSP, log info is first internally
converted to PEL format and then pushed to the FSP. All the errors logged
in Sapphire are again pushed up to POWERNV platform by the FSP and all the errors
reported by Sapphire and POWERNV are logged in FSP.

If the user does not intend to dump various user data sections, but just
log the error with some amount of description around that error, they can do
so using just the simple error logging interface:

log_simple_error(uint32_t reason_code, char *fmt, ...);

Eg: log_simple_error(OPAL_RC_SURVE_STATUS,
			"SURV: Error retreiving surveillance status: %d\n",
                       						err_len);

Using the reason code, an error log is generated with the information derived
from the look-up table, populated and committed to FSP. All of it
is done with just one call.

Note:
====
* For more information regarding error logging and PEL format
  refer to PAPR doc and P7 PEL and SRC PLDD document.

* Refer to include/opal.h for all the error logging
  interface parameters and include/fsp-pel.h for PEL
  structures.

Sample error logging:
===================
/*
 * Sample: log one error with three user-defined data sections and
 * commit it to the FSP.
 *
 * index: caller-supplied value; it is only printed in the trace message.
 *
 * An error log is created for OPAL_RC_CHIP_MASTER, three tagged user
 * dump sections are appended, and the log is committed with
 * elog_fsp_commit(). Progress and return codes are reported through
 * printf(); nothing is returned to the caller.
 */
void report_error(int index)
{
        struct opal_errorlog *buf;
        int rc;
        /*
         * Payloads are 'unsigned char' so they match the 'data' parameter
         * of opal_elog_update_user_dump() without a pointer-sign mismatch.
         */
        unsigned char data1[] = "This is a sample user defined data section1";
        unsigned char data2[] = "Error logging sample. These are dummy errors. Section 2";
        /*
         * The whitespace in the middle of this payload is written out
         * explicitly (four tabs and a space, as seen in the sample PEL
         * dump in this document) instead of relying on a backslash
         * line-continuation, which would make the logged bytes depend on
         * how the source happens to be indented.
         */
        unsigned char data3[] = "Sample error Sample error Sample error Sample error "
                                "\t\t\t\t Sample error abcdefghijklmnopqrstuvwxyz";
        /* Same type as the 'tag' parameter of opal_elog_update_user_dump() */
        uint32_t tag;

        printf("ELOG: In machine check report error index: %d\n", index);

        /* To report an error, create an error log with relevant information
         * struct opal_errorlog *opal_elog_create(int reason_code);
         * Call returns a pre-allocated buffer of type 'struct opal_errorlog'
         * with relevant fields updated.
         */
        buf = opal_elog_create(OPAL_RC_CHIP_MASTER);
        if (buf == NULL) {
                printf("ELOG: Error getting buffer.\n");
                return;
        }

        /* In case of user wanting to add multiple sections of various dump data
         * for better debug, data sections can be added using this interface
         * int opal_elog_update_user_dump(struct opal_errorlog *buf, unsigned char *data,
         *                                          uint32_t tag, uint16_t size)
         */
        /* tag -> unique ascii tag to identify a particular data dump section */
        tag = 0x4b4b4b4b;       /* "KKKK" */
        rc = opal_elog_update_user_dump(buf, data1, tag, sizeof(data1));
        printf("ELOG: User data updated. rc : %d \n", rc);

        tag = 0x4c4c4c4c;       /* "LLLL" */
        rc = opal_elog_update_user_dump(buf, data2, tag, sizeof(data2));
        printf("ELOG: User data updated. rc : %d \n", rc);

        tag = 0x4d4d4d4d;       /* "MMMM" */
        rc = opal_elog_update_user_dump(buf, data3, tag, sizeof(data3));
        printf("ELOG: User data updated. rc : %d \n", rc);

        /* Once all info is updated, ready to be sent to FSP */
        printf("ELOG:commit to FSP\n");
        rc = elog_fsp_commit(buf);
        /* elog_fsp_commit() returns 0 on success */
        if (rc != 0)
                printf("ELOG: Re-try error logging\n");
}

 Sample output PEL dump got from FSP:
 ===================================
 $ errl -d -x 0x53D5EA83
 |   00000000     50480030  01004000  20131126  05064700     PH.0..@. .....G.   |
 |   00000010     20131126  05064790  4B000109  00000000      .....G.K.......   |
 |   00000020     00000000  00000000  B0000003  53D5EA83     ............S...   |
 |   00000030     55480018  01004000  20000000  00000000     UH....@. .......   |
 |   00000040     00002000  01005300  50530050  01004000     .. ...S.PS.P..@.   |
 |   00000050     02000008  00000048  00000080  00000000     .......H........   |
 |   00000060     00000000  00000000  01234567  22220222     .........#Eg""."   |
 |   00000070     34560123  98768920  42423832  34303132     4V.#.v. BB824012   |
 |   00000080     20202020  20202020  20202020  20202020                        |
 |   00000090     20202020  20202020  4548004C  01004000             EH.L..@.   |
 |   000000A0     38323436  2D4C3243  30363033  37374100     8246-L2C060377A.   |
 |   000000B0     00000000  00000000  00000000  00000000     ................   |
 |   000000C0     00000000  00000000  00000000  00000000     ................   |
 |   000000D0     00000000  00000000  00000000  05064700     ..............G.   |
 |   000000E0     00000000  4D54001C  01004000  38323436     ....MT....@.8246   |
 |   000000F0     2D4C3243  30363033  37374100  00000000     -L2C060377A.....   |
 |   00000100     5544003C  01004000  4B4B4B4B  00340000     UD....@.KKKK.4..   |
 |   00000110     54686973  20697320  61207361  6D706C65     This is a sample   |
 |   00000120     20757365  72206465  66696E65  64206461      user defined da   |
 |   00000130     74612073  65637469  6F6E3100  55440048     ta section1.UD.H   |
 |   00000140     01004000  4C4C4C4C  00400000  4572726F     ..@.LLLL.@..Erro   |
 |   00000150     72206C6F  6767696E  67207361  6D706C65     r logging sample   |
 |   00000160     2E205468  65736520  61726520  64756D6D     . These are dumm   |
 |   00000170     79206572  726F7273  2E205365  6374696F     y errors. Sectio   |
 |   00000180     6E203200  55440071  01004000  4D4D4D4D     n 2.UD.q..@.MMMM   |
 |   00000190     00690000  53616D70  6C652065  72726F72     .i..Sample error   |
 |   000001A0     2053616D  706C6520  6572726F  72205361      Sample error Sa   |
 |   000001B0     6D706C65  20657272  6F722053  616D706C     mple error Sampl   |
 |   000001C0     65206572  726F7220  09090909  2053616D     e error .... Sam   |
 |   000001D0     706C6520  6572726F  72206162  63646566     ple error abcdef   |
 |   000001E0     6768696A  6B6C6D6E  6F707172  73747576     ghijklmnopqrstuv   |
 |   000001F0     7778797A  00                               wxyz.              |
 |------------------------------------------------------------------------------|
 |                       Platform Event Log - 0x53D5EA83                        |
 |------------------------------------------------------------------------------|
 |                                Private Header                                |
 |------------------------------------------------------------------------------|
 | Section Version          : 1                                                 |
 | Sub-section type         : 0                                                 |
 | Created by               : 4000                                              |
 | Created at               : 11/26/2013 05:06:47                               |
 | Committed at             : 11/26/2013 05:06:47                               |
 | Creator Subsystem        : Unknown - 0x0000004B                              |
 | CSSVER                   :                                                   |
 | Platform Log Id          : 0xB0000003                                        |
 | Entry Id                 : 0x53D5EA83                                        |
 | Total Log Size           : 644                                               |
 |------------------------------------------------------------------------------|
 |                                 User Header                                  |
 |------------------------------------------------------------------------------|
 | Section Version          : 1                                                 |
 | Sub-section type         : 0                                                 |
 | Log Committed by         : 4000                                              |
 | Subsystem                : Memory Subsystem                                  |
 | Event Scope              : Unknown - 0x00000000                              |
 | Event Severity           : Informational Event                               |
 | Event Type               : Not Applicable                                    |
 | Return Code              : 0x00000000                                        |
 | Action Flags             : Report to Operating System                        |
 | Action Status            : Sent to Hypervisor                                |
 |------------------------------------------------------------------------------|
 |                        Primary System Reference Code                         |
 |------------------------------------------------------------------------------|
 | Section Version          : 1                                                 |
 | Sub-section type         : 0                                                 |
 | Created by               : 4000                                              |
 | SRC Format               : 0x80                                              |
 | SRC Version              : 0x02                                              |
 | Virtual Progress SRC     : False                                             |
 | I5/OS Service Event Bit  : False                                             |
 | Hypervisor Dump Initiated: False                                             |
 | Power Control Net Fault  : False                                             |
 |                                                                              |
 | Valid Word Count         : 0x08                                              |
 | Reference Code           : BB824012                                          |
 | Hex Words 2 - 5          : 00000080 00000000 00000000 00000000               |
 | Hex Words 6 - 9          : 01234567 22220222 34560123 98768920               |
 |                                                                              |
 |------------------------------------------------------------------------------|
 |                             Extended User Header                             |
 |------------------------------------------------------------------------------|
 | Section Version          : 1                                                 |
 | Sub-section type         : 0                                                 |
 | Created by               : 4000                                              |
 | Reporting Machine Type   : 8246-L2C                                          |
 | Reporting Serial Number  : 060377A                                           |
 | FW Released Ver          :                                                   |
 | FW SubSys Version        :                                                   |
 | Common Ref Time          : 00/00/0000 05:06:47                               |
 | Symptom Id Len           : 0                                                 |
 | Symptom Id               :                                                   |
 |------------------------------------------------------------------------------|
 |                      Machine Type/Model & Serial Number                      |
 |------------------------------------------------------------------------------|
 | Section Version          : 1                                                 |
 | Sub-section type         : 0                                                 |
 | Created by               : 4000                                              |
 | Machine Type Model       : 8246-L2C                                          |
 | Serial Number            : 060377A                                           |
 |------------------------------------------------------------------------------|
 |                              User Defined Data                               |
 |------------------------------------------------------------------------------|
 | Section Version          : 1                                                 |
 | Sub-section type         : 0                                                 |
 | Created by               : 4000                                              |
 |                                                                              |
 |   00000000     4B4B4B4B  00340000  54686973  20697320     KKKK.4..This is    |
 |   00000010     61207361  6D706C65  20757365  72206465     a sample user de   |
 |   00000020     66696E65  64206461  74612073  65637469     fined data secti   |
 |   00000030     6F6E3100                                   on1.               |
 |                                                                              |
 |------------------------------------------------------------------------------|
 |                              User Defined Data                               |
 |------------------------------------------------------------------------------|
 | Section Version          : 1                                                 |
 | Sub-section type         : 0                                                 |
 | Created by               : 4000                                              |
 |                                                                              |
 |   00000000     4C4C4C4C  00400000  4572726F  72206C6F     LLLL.@..Error lo   |
 |   00000010     6767696E  67207361  6D706C65  2E205468     gging sample. Th   |
 |   00000020     65736520  61726520  64756D6D  79206572     ese are dummy er   |
 |   00000030     726F7273  2E205365  6374696F  6E203200     rors. Section 2.   |
 |                                                                              |
 |------------------------------------------------------------------------------|
 |                              User Defined Data                               |
 |------------------------------------------------------------------------------|
 | Section Version          : 1                                                 |
 | Sub-section type         : 0                                                 |
 | Created by               : 4000                                              |
 |                                                                              |
 |   00000000     4D4D4D4D  00690000  53616D70  6C652065     MMMM.i..Sample e   |
 |   00000010     72726F72  2053616D  706C6520  6572726F     rror Sample erro   |
 |   00000020     72205361  6D706C65  20657272  6F722053     r Sample error S   |
 |   00000030     616D706C  65206572  726F7220  09090909     ample error ....   |
 |   00000040     2053616D  706C6520  6572726F  72206162      Sample error ab   |
 |   00000050     63646566  6768696A  6B6C6D6E  6F707172     cdefghijklmnopqr   |
 |   00000060     73747576  7778797A  00                     stuvwxyz.          |
 |                                                                              |
 |------------------------------------------------------------------------------|

OpenPOWER on IntegriCloud