Discussion:
[PATCH v2 1/4] can: dev: Consolidate and unify state change handling.
Andri Yngvason
2014-09-26 18:25:24 UTC
Permalink
The handling of can error states is different between platforms.
This is an attempt to correct that problem.

I've moved this handling into a generic function for changing the
error state. This ensures that error state changes are handled
the same way everywhere (where this function is used).

What's changed from the last version of this patch-set is that
can_change_state() now relies on the individual states of the rx/tx
counters rather than their individual count values.

Signed-off-by: Andri Yngvason <***@marel.com>
---
drivers/net/can/dev.c | 106 +++++++++++++++++++++++++++++++++++++++++
include/linux/can/dev.h | 5 ++
include/uapi/linux/can/error.h | 1 +
3 files changed, 112 insertions(+)

diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index 02492d2..f2ad15e 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -273,6 +273,112 @@ static int can_get_bittiming(struct net_device *dev,
struct can_bittiming *bt,
return err;
}

+static void can_update_error_counters(struct net_device *dev,
+ enum can_state new_state)
+{
+ struct can_priv *priv = netdev_priv(dev);
+
+ if (new_state <= priv->state)
+ return;
+
+ switch (new_state) {
+ case CAN_STATE_ERROR_ACTIVE:
+ netdev_warn(dev, "%s: did we come from a state less than error-active?",
+ __func__);
+ break;
+ case CAN_STATE_ERROR_WARNING:
+ priv->can_stats.error_warning++;
+ break;
+ case CAN_STATE_ERROR_PASSIVE:
+ priv->can_stats.error_passive++;
+ break;
+ case CAN_STATE_BUS_OFF:
+ priv->can_stats.bus_off++;
+ break;
+ default:
+ netdev_warn(dev, "%s: %d is not a state", __func__, new_state);
+ break;
+ };
+}
+
+static int can_txstate_to_frame(struct net_device *dev, enum can_state state)
+{
+ switch (state) {
+ case CAN_STATE_ERROR_ACTIVE:
+ return CAN_ERR_CRTL_ACTIVE;
+ case CAN_STATE_ERROR_WARNING:
+ return CAN_ERR_CRTL_TX_WARNING;
+ case CAN_STATE_ERROR_PASSIVE:
+ return CAN_ERR_CRTL_TX_PASSIVE;
+ case CAN_STATE_BUS_OFF:
+ netdev_warn(dev, "%s: bus-off is not handled here", __func__);
+ return 0;
+ default:
+ netdev_warn(dev, "%s: %d is not a state", __func__, state);
+ return 0;
+ }
+}
+
+static int can_rxstate_to_frame(struct net_device *dev, enum can_state state)
+{
+ switch (state) {
+ case CAN_STATE_ERROR_ACTIVE:
+ return CAN_ERR_CRTL_ACTIVE;
+ case CAN_STATE_ERROR_WARNING:
+ return CAN_ERR_CRTL_RX_WARNING;
+ case CAN_STATE_ERROR_PASSIVE:
+ return CAN_ERR_CRTL_RX_PASSIVE;
+ case CAN_STATE_BUS_OFF:
+ netdev_warn(dev, "%s: bus-off is not handled here", __func__);
+ return 0;
+ default:
+ netdev_warn(dev, "%s: %d is not a state", __func__, state);
+ return 0;
+ }
+}
+
+void can_change_state(struct net_device *dev, struct can_frame *cf,
+ enum can_state new_state, enum can_state tx_state,
+ enum can_state rx_state)
+{
+ struct can_priv *priv = netdev_priv(dev);
+
+ if (unlikely(new_state == priv->state)) {
+ netdev_warn(dev, "%s: oops, state did not change", __func__);
+ return;
+ }
+
+ can_update_error_counters(dev, new_state);
+
+ if (unlikely(new_state == CAN_STATE_BUS_OFF)) {
+ cf->can_id |= CAN_ERR_BUSOFF;
+ } else {
+ cf->can_id |= CAN_ERR_CRTL;
+ if (tx_state > rx_state)
+ cf->data[1] |= can_txstate_to_frame(dev, tx_state);
+ else if (tx_state < rx_state)
+ cf->data[1] |= can_rxstate_to_frame(dev, rx_state);
+ else
+ cf->data[1] |= can_txstate_to_frame(dev, tx_state)
+ | can_rxstate_to_frame(dev, rx_state);
+ }
+
+ priv->state = new_state;
+}
+EXPORT_SYMBOL_GPL(can_change_state);
+
+enum can_state can_errcnt_to_state(unsigned int count)
+{
+ if (unlikely(count > 127))
+ return CAN_STATE_ERROR_PASSIVE;
+
+ if (unlikely(count > 96))
+ return CAN_STATE_ERROR_WARNING;
+
+ return CAN_STATE_ERROR_ACTIVE;
+}
+EXPORT_SYMBOL_GPL(can_errcnt_to_state);
+
/*
* Local echo of CAN messages
*
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 6992afc..6505208 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -121,6 +121,11 @@ void unregister_candev(struct net_device *dev);
int can_restart_now(struct net_device *dev);
void can_bus_off(struct net_device *dev);

+void can_change_state(struct net_device *dev, struct can_frame *cf,
+ enum can_state new_state, enum can_state tx_state,
+ enum can_state rx_state);
+enum can_state can_errcnt_to_state(unsigned int count);
+
void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
unsigned int idx);
unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx);
diff --git a/include/uapi/linux/can/error.h b/include/uapi/linux/can/error.h
index c247446..1c508be 100644
--- a/include/uapi/linux/can/error.h
+++ b/include/uapi/linux/can/error.h
@@ -71,6 +71,7 @@
#define CAN_ERR_CRTL_TX_PASSIVE 0x20 /* reached error passive status TX */
/* (at least one error counter exceeds */
/* the protocol-defined level of 127) */
+#define CAN_ERR_CRTL_ACTIVE 0x40 /* recovered to error active state */

/* error in CAN protocol (type) / data[2] */
#define CAN_ERR_PROT_UNSPEC 0x00 /* unspecified */
--
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Wolfgang Grandegger
2014-10-20 19:45:46 UTC
Permalink
Hi Andri,

sorry for the long delay...
Post by Andri Yngvason
The handling of can error states is different between platforms.
This is an attempt to correct that problem.
I've moved this handling into a generic function for changing the
error state. This ensures that error state changes are handled
the same way everywhere (where this function is used).
What's changed from the last version of this patch-set is that
can_change_state() now relies on the individual states of the rx/tx
counters rather than their individual count values.
To see if the state changes occur as expected could you please record
error message traces with candump for the following two scenarios:

1. send messages with cangen
disconnect the cable
reconnect the cable after a while until the error active state is
reached.

2. set restart-ms=100
send messages with cangen
provoke a bus-off short-circuiting CAN low and high
remove the short-circuit

Start with the SJA100 first.

Thanks,

Wolfgang.

--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andri Yngvason
2014-10-21 10:42:27 UTC
Permalink
Post by Wolfgang Grandegger
Hi Andri,
sorry for the long delay...
That's all right. I've been quite busy myself.
Post by Wolfgang Grandegger
Post by Andri Yngvason
The handling of can error states is different between platforms.
This is an attempt to correct that problem.
I've moved this handling into a generic function for changing the
error state. This ensures that error state changes are handled
the same way everywhere (where this function is used).
What's changed from the last version of this patch-set is that
can_change_state() now relies on the individual states of the rx/tx
counters rather than their individual count values.
To see if the state changes occur as expected could you please record
1. send messages with cangen
disconnect the cable
reconnect the cable after a while until the error active state is
reached.
I tried this the other day with flexcan. If there is nothing else
happening on the bus, the error state will not go back down, regardless
of whether the patches are applied or not. However, this does work if
another device is also sending on the bus.

Sadly, I did not save the logs.
Post by Wolfgang Grandegger
2. set restart-ms=3D100
send messages with cangen
provoke a bus-off short-circuiting CAN low and high
remove the short-circuit
That actually worked as expected. It's supposed to restart the bus, rig=
ht?
Post by Wolfgang Grandegger
Start with the SJA100 first.
I'll try to find some time for these experiments.

Thanks,
Andri
--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Wolfgang Grandegger
2014-10-21 10:52:37 UTC
Permalink
On Tue, 21 Oct 2014 10:42:27 +0000, Andri Yngvason
Post by Andri Yngvason
Post by Wolfgang Grandegger
Hi Andri,
sorry for the long delay...
That's all right. I've been quite busy myself.
Post by Wolfgang Grandegger
Post by Andri Yngvason
The handling of can error states is different between platforms.
This is an attempt to correct that problem.
I've moved this handling into a generic function for changing the
error state. This ensures that error state changes are handled
the same way everywhere (where this function is used).
What's changed from the last version of this patch-set is that
can_change_state() now relies on the individual states of the rx/tx
counters rather than their individual count values.
To see if the state changes occur as expected could you please recor=
d
Post by Andri Yngvason
Post by Wolfgang Grandegger
1. send messages with cangen
disconnect the cable
reconnect the cable after a while until the error active state is
reached.
I tried this the other day with flexcan. If there is nothing else
happening on the bus, the error state will not go back down, regardle=
ss
Post by Andri Yngvason
of whether the patches are applied or not. However, this does work if
another device is also sending on the bus.
If the device continues to send (because cangen is still running), the
error counter should decrease. Don't know if the Flexcan does it right
but it works like that on the SJA1000.
Post by Andri Yngvason
=20
Sadly, I did not save the logs.
Post by Wolfgang Grandegger
2. set restart-ms=3D100
send messages with cangen
provoke a bus-off short-circuiting CAN low and high
remove the short-circuit
That actually worked as expected. It's supposed to restart the bus,
right?

The bus will automatically be restarted after 100 ms. The interesting p=
art
is
the flow of the state changes (error messages).
Post by Andri Yngvason
Post by Wolfgang Grandegger
Start with the SJA100 first.
I'll try to find some time for these experiments.
Thanks,

Wolfgang.
--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andri Yngvason
2014-10-21 14:56:47 UTC
Permalink
Post by Wolfgang Grandegger
On Tue, 21 Oct 2014 10:42:27 +0000, Andri Yngvason
Post by Andri Yngvason
Post by Wolfgang Grandegger
Hi Andri,
sorry for the long delay...
That's all right. I've been quite busy myself.
Post by Wolfgang Grandegger
Post by Andri Yngvason
The handling of can error states is different between platforms.
This is an attempt to correct that problem.
I've moved this handling into a generic function for changing the
error state. This ensures that error state changes are handled
the same way everywhere (where this function is used).
What's changed from the last version of this patch-set is that
can_change_state() now relies on the individual states of the rx/t=
x
Post by Wolfgang Grandegger
Post by Andri Yngvason
Post by Wolfgang Grandegger
Post by Andri Yngvason
counters rather than their individual count values.
To see if the state changes occur as expected could you please reco=
rd
Post by Wolfgang Grandegger
Post by Andri Yngvason
Post by Wolfgang Grandegger
1. send messages with cangen
disconnect the cable
reconnect the cable after a while until the error active state i=
s
Post by Wolfgang Grandegger
Post by Andri Yngvason
Post by Wolfgang Grandegger
reached.
I tried this the other day with flexcan. If there is nothing else
happening on the bus, the error state will not go back down, regardl=
ess
Post by Wolfgang Grandegger
Post by Andri Yngvason
of whether the patches are applied or not. However, this does work i=
f
Post by Wolfgang Grandegger
Post by Andri Yngvason
another device is also sending on the bus.
If the device continues to send (because cangen is still running), th=
e
Post by Wolfgang Grandegger
error counter should decrease. Don't know if the Flexcan does it righ=
t
Post by Wolfgang Grandegger
but it works like that on the SJA1000.
It turns out that I can't get this to work on sja1000 either without se=
nding
from another device.

Also, I must not have tested this properly because:
***@x86-20140911-072109:~# candump can1,0~0,#FFFFFFFF | grep -v 200000=
02

can1 20000004 [8] 00 40 00 00 00 00 00 60 ERRORFRAME
can1 20000004 [8] 00 20 00 00 00 00 00 80 ERRORFRAME
can1 20000004 [8] 00 40 00 00 00 00 00 5F ERRORFRAME


The problem here is :
+enum can_state can_errcnt_to_state(unsigned int count)
+{
+ if (unlikely(count > 127))
+ return CAN_STATE_ERROR_PASSIVE;
+
+ if (unlikely(count > 96))
+ return CAN_STATE_ERROR_WARNING;
+
+ return CAN_STATE_ERROR_ACTIVE;
+}

96 aught to be 95, or perhaps it would make more sense to make the chec=
ks count
Post by Wolfgang Grandegger
=3D 128 and count >=3D 96, respectively.
Maybe generate some error message if neither rx or tx states are greate=
r than
new_state?

I'd written unit tests for this but I guess they don't really protect y=
ou
against "false predicate" errors.
Post by Wolfgang Grandegger
Post by Andri Yngvason
Sadly, I did not save the logs.
Post by Wolfgang Grandegger
2. set restart-ms=3D100
send messages with cangen
provoke a bus-off short-circuiting CAN low and high
remove the short-circuit
That actually worked as expected. It's supposed to restart the bus,
right?
The bus will automatically be restarted after 100 ms. The interesting=
part
Post by Wolfgang Grandegger
is
the flow of the state changes (error messages).
***@x86-20140911-072109:~# candump can1,0~0,#FFFFFFFF | grep -v 200000=
02
can1 20000004 [8] 00 20 00 00 00 00 00 88 ERRORFRAME
can1 20000004 [8] 00 20 00 00 00 00 00 88 ERRORFRAME
can1 20000040 [8] 00 00 00 00 00 00 00 7F ERRORFRAME
can1 20000100 [8] 00 00 00 00 00 00 00 00 ERRORFRAME
can1 20000004 [8] 00 20 00 00 00 00 00 88 ERRORFRAME
can1 20000004 [8] 00 20 00 00 00 00 00 88 ERRORFRAME
can1 20000040 [8] 00 00 00 00 00 00 00 7F ERRORFRAME
can1 20000100 [8] 00 00 00 00 00 00 00 00 ERRORFRAME
can1 20000004 [8] 00 20 00 00 00 00 00 88 ERRORFRAME
can1 20000004 [8] 00 20 00 00 00 00 00 88 ERRORFRAME
can1 20000040 [8] 00 00 00 00 00 00 00 7F ERRORFRAME
can1 20000100 [8] 00 00 00 00 00 00 00 00 ERRORFRAME
etc...

Here we get error-passive twice in a row. The error count is the same i=
n both
cases though....

I just noticed that I swapped tx and rx in data[6..7]. I'll have to fix=
that too.
Post by Wolfgang Grandegger
Post by Andri Yngvason
Post by Wolfgang Grandegger
Start with the SJA100 first.
I'll try to find some time for these experiments.
Best regards,
Andri
--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Wolfgang Grandegger
2014-10-21 15:21:35 UTC
Permalink
On Tue, 21 Oct 2014 14:56:47 +0000, Andri Yngvason
Post by Andri Yngvason
Post by Wolfgang Grandegger
On Tue, 21 Oct 2014 10:42:27 +0000, Andri Yngvason
Post by Andri Yngvason
Post by Wolfgang Grandegger
Hi Andri,
sorry for the long delay...
That's all right. I've been quite busy myself.
Post by Wolfgang Grandegger
Post by Andri Yngvason
The handling of can error states is different between platforms.
This is an attempt to correct that problem.
I've moved this handling into a generic function for changing the
error state. This ensures that error state changes are handled
the same way everywhere (where this function is used).
What's changed from the last version of this patch-set is that
can_change_state() now relies on the individual states of the rx/=
tx
Post by Andri Yngvason
Post by Wolfgang Grandegger
Post by Andri Yngvason
Post by Wolfgang Grandegger
Post by Andri Yngvason
counters rather than their individual count values.
To see if the state changes occur as expected could you please rec=
ord
Post by Andri Yngvason
Post by Wolfgang Grandegger
Post by Andri Yngvason
Post by Wolfgang Grandegger
1. send messages with cangen
disconnect the cable
reconnect the cable after a while until the error active state =
is
Post by Andri Yngvason
Post by Wolfgang Grandegger
Post by Andri Yngvason
Post by Wolfgang Grandegger
reached.
I tried this the other day with flexcan. If there is nothing else
happening on the bus, the error state will not go back down,
regardless
Post by Andri Yngvason
Post by Wolfgang Grandegger
Post by Andri Yngvason
of whether the patches are applied or not. However, this does work =
if
Post by Andri Yngvason
Post by Wolfgang Grandegger
Post by Andri Yngvason
another device is also sending on the bus.
If the device continues to send (because cangen is still running), t=
he
Post by Andri Yngvason
Post by Wolfgang Grandegger
error counter should decrease. Don't know if the Flexcan does it rig=
ht
Post by Andri Yngvason
Post by Wolfgang Grandegger
but it works like that on the SJA1000.
It turns out that I can't get this to work on sja1000 either without
sending
from another device.
=20
20000002
Post by Andri Yngvason
=20
can1 20000004 [8] 00 40 00 00 00 00 00 60 ERRORFRAME
can1 20000004 [8] 00 20 00 00 00 00 00 80 ERRORFRAME
can1 20000004 [8] 00 40 00 00 00 00 00 5F ERRORFRAME
=20
=20
+enum can_state can_errcnt_to_state(unsigned int count)
+{
+ if (unlikely(count > 127))
+ return CAN_STATE_ERROR_PASSIVE;
+
+ if (unlikely(count > 96))
+ return CAN_STATE_ERROR_WARNING;
+
+ return CAN_STATE_ERROR_ACTIVE;
+}
=20
96 aught to be 95, or perhaps it would make more sense to make the
checks
Post by Andri Yngvason
count
Post by Wolfgang Grandegger
=3D 128 and count >=3D 96, respectively.
=20
Maybe generate some error message if neither rx or tx states are grea=
ter
Post by Andri Yngvason
than
new_state?
This is how it should look like (from my old commits):

Here is an example output of "candump -td -e any,0:0,#FFFFFFFF" for
a recovery from error passive state due to no ack/cable (reconnect
after 5s) for a SJA1000 on an on EMS PCI card:

(000.201913) can0 1C [0]
(000.212241) can0 20000204 [8] 00 08 00 00 00 00 60 00 ERRORFRAME
controller-problem{tx-error-warning}
state-change{tx-error-warning}
error-counter-tx-rx{{96}{0}}
(000.003544) can0 20000204 [8] 00 20 00 00 00 00 80 00 ERRORFRAME
controller-problem{tx-error-passive}
state-change{tx-error-passive}
error-counter-tx-rx{{128}{0}}
(004.901842) can0 1D [7] 1D F6 33 52 31 4B DE
(000.000116) can0 20000200 [8] 00 08 00 00 00 00 7F 00 ERRORFRAME
state-change{tx-error-warning}
error-counter-tx-rx{{127}{0}}
(000.000678) can0 1E [6] 42 05 14 82 23 B6
...
(000.201927) can0 49 [4] 2F 1A 97 25
(000.000096) can0 20000200 [8] 00 40 00 00 00 00 5F 00 ERRORFRAME
state-change{back-to-error-active}
error-counter-tx-rx{{95}{0}}
(000.202184) can0 4A [8] 7F 87 0E FE 03 BA 78 91

Messages are generated with "cangen", also while the cable is
disconnected.
Post by Andri Yngvason
I'd written unit tests for this but I guess they don't really protect
you
Post by Andri Yngvason
against "false predicate" errors.
Post by Wolfgang Grandegger
Post by Andri Yngvason
Sadly, I did not save the logs.
Post by Wolfgang Grandegger
2. set restart-ms=3D100
send messages with cangen
provoke a bus-off short-circuiting CAN low and high
remove the short-circuit
That actually worked as expected. It's supposed to restart the bus,
right?
The bus will automatically be restarted after 100 ms. The interestin=
g
Post by Andri Yngvason
Post by Wolfgang Grandegger
part
is
the flow of the state changes (error messages).
20000002
Post by Andri Yngvason
can1 20000004 [8] 00 20 00 00 00 00 00 88 ERRORFRAME
can1 20000004 [8] 00 20 00 00 00 00 00 88 ERRORFRAME
can1 20000040 [8] 00 00 00 00 00 00 00 7F ERRORFRAME
can1 20000100 [8] 00 00 00 00 00 00 00 00 ERRORFRAME
can1 20000004 [8] 00 20 00 00 00 00 00 88 ERRORFRAME
can1 20000004 [8] 00 20 00 00 00 00 00 88 ERRORFRAME
can1 20000040 [8] 00 00 00 00 00 00 00 7F ERRORFRAME
can1 20000100 [8] 00 00 00 00 00 00 00 00 ERRORFRAME
can1 20000004 [8] 00 20 00 00 00 00 00 88 ERRORFRAME
can1 20000004 [8] 00 20 00 00 00 00 00 88 ERRORFRAME
can1 20000040 [8] 00 00 00 00 00 00 00 7F ERRORFRAME
can1 20000100 [8] 00 00 00 00 00 00 00 00 ERRORFRAME
etc...
=20
Here we get error-passive twice in a row. The error count is the same=
in
Post by Andri Yngvason
both
cases though....
Why twice? The option "-td" would be useful as well.

Wolfgang.
--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andri Yngvason
2014-10-22 11:48:26 UTC
Permalink
To see if the state changes occur as expected could you please re=
cord
error message traces with candump for the following two scenarios=
1. send messages with cangen
disconnect the cable
reconnect the cable after a while until the error active state=
is
reached.
After cleaning up my mess, this is the output for the disconnected cabl=
e test:
(000.000000) can1 20000004 [8] 00 08 00 00 00 00 60 00 ERRORFRA=
ME
controller-problem{tx-error-warning}
error-counter-tx-rx{{96}{0}}
(000.003953) can1 20000004 [8] 00 20 00 00 00 00 80 00 ERRORFRA=
ME
controller-problem{tx-error-passive}
error-counter-tx-rx{{128}{0}}
(005.959170) can1 20000002 [8] 03 00 00 00 00 00 00 00 ERRORFRA=
ME
lost-arbitration{at bit 3}
(000.428328) can1 20000004 [8] 00 40 00 00 00 00 5F 00 ERRORFRA=
ME
controller-problem{back-to-error-active}
error-counter-tx-rx{{95}{0}}
Post by Wolfgang Grandegger
2. set restart-ms=3D100
send messages with cangen
provoke a bus-off short-circuiting CAN low and high
remove the short-circuit
Shorting the can bus yields a loop like this:
(044.014170) can1 20000004 [8] 00 20 00 00 00 00 88 00 ERRORFRA=
ME
controller-problem{tx-error-passive}
error-counter-tx-rx{{136}{0}}
(000.003175) can1 20000040 [8] 00 00 00 00 00 00 7F 00 ERRORFRA=
ME
bus-off
error-counter-tx-rx{{127}{0}}
(000.099664) can1 20000100 [8] 00 00 00 00 00 00 00 00 ERRORFRA=
ME
restarted-after-bus-off
(000.097246) can1 20000004 [8] 00 20 00 00 00 00 88 00 ERRORFRA=
ME
controller-problem{tx-error-passive}
error-counter-tx-rx{{136}{0}}
(000.003160) can1 20000040 [8] 00 00 00 00 00 00 7F 00 ERRORFRA=
ME
bus-off
error-counter-tx-rx{{127}{0}}
(000.099602) can1 20000100 [8] 00 00 00 00 00 00 00 00 ERRORFRA=
ME
restarted-after-bus-off

Cheers,
Andri
--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Wolfgang Grandegger
2014-10-22 12:30:07 UTC
Permalink
On Wed, 22 Oct 2014 11:48:26 +0000, Andri Yngvason
Post by Wolfgang Grandegger
To see if the state changes occur as expected could you please record
error message traces with candump for the following two scenario=
1. send messages with cangen
disconnect the cable
reconnect the cable after a while until the error active stat=
e
is
Post by Wolfgang Grandegger
reached.
After cleaning up my mess, this is the output for the disconnected ca=
ble
(000.000000) can1 20000004 [8] 00 08 00 00 00 00 60 00 =20
ERRORFRAME
controller-problem{tx-error-warning}
error-counter-tx-rx{{96}{0}}
(000.003953) can1 20000004 [8] 00 20 00 00 00 00 80 00 =20
ERRORFRAME
controller-problem{tx-error-passive}
error-counter-tx-rx{{128}{0}}
(005.959170) can1 20000002 [8] 03 00 00 00 00 00 00 00 =20
ERRORFRAME
lost-arbitration{at bit 3}
I'm missing an error warning message here... at least on the SJA1000 it=
's
triggered.
(000.428328) can1 20000004 [8] 00 40 00 00 00 00 5F 00 =20
ERRORFRAME
controller-problem{back-to-error-active}
error-counter-tx-rx{{95}{0}}
=20
Post by Wolfgang Grandegger
Post by Wolfgang Grandegger
2. set restart-ms=3D100
send messages with cangen
provoke a bus-off short-circuiting CAN low and high
remove the short-circuit
(044.014170) can1 20000004 [8] 00 20 00 00 00 00 88 00 =20
ERRORFRAME
controller-problem{tx-error-passive}
error-counter-tx-rx{{136}{0}}
(000.003175) can1 20000040 [8] 00 00 00 00 00 00 7F 00 =20
ERRORFRAME
bus-off
error-counter-tx-rx{{127}{0}}
(000.099664) can1 20000100 [8] 00 00 00 00 00 00 00 00 =20
ERRORFRAME
restarted-after-bus-off
(000.097246) can1 20000004 [8] 00 20 00 00 00 00 88 00 =20
ERRORFRAME
controller-problem{tx-error-passive}
error-counter-tx-rx{{136}{0}}
(000.003160) can1 20000040 [8] 00 00 00 00 00 00 7F 00 =20
ERRORFRAME
bus-off
error-counter-tx-rx{{127}{0}}
(000.099602) can1 20000100 [8] 00 00 00 00 00 00 00 00 =20
ERRORFRAME
restarted-after-bus-off
And the restart does come when the short-circuit is gone?

Wolfgang.

--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andri Yngvason
2014-10-22 12:48:11 UTC
Permalink
Post by Wolfgang Grandegger
On Wed, 22 Oct 2014 11:48:26 +0000, Andri Yngvason
Post by Wolfgang Grandegger
To see if the state changes occur as expected could you please
record
Post by Wolfgang Grandegger
error message traces with candump for the following two scenari=
1. send messages with cangen
disconnect the cable
reconnect the cable after a while until the error active sta=
te
Post by Wolfgang Grandegger
is
Post by Wolfgang Grandegger
reached.
After cleaning up my mess, this is the output for the disconnected c=
able
Post by Wolfgang Grandegger
(000.000000) can1 20000004 [8] 00 08 00 00 00 00 60 00 =20
ERRORFRAME
controller-problem{tx-error-warning}
error-counter-tx-rx{{96}{0}}
(000.003953) can1 20000004 [8] 00 20 00 00 00 00 80 00 =20
ERRORFRAME
controller-problem{tx-error-passive}
error-counter-tx-rx{{128}{0}}
(005.959170) can1 20000002 [8] 03 00 00 00 00 00 00 00 =20
ERRORFRAME
lost-arbitration{at bit 3}
I'm missing an error warning message here... at least on the SJA1000 =
it's
Post by Wolfgang Grandegger
triggered.
I found this peculiar as well. However, I don't get the warning without=
the
patch applied either.
Then I just get:
***@x86-20140911-072109:~# candump -td -e can1,0~0,#FFFFFFFF | tee
candump.log =20
(000.000000) can1 20000004 [8] 00 08 00 00 00 00 60 00 ERRORFRA=
ME
controller-problem{tx-error-warning}
error-counter-tx-rx{{96}{0}}
(000.002425) can1 20000004 [8] 00 20 00 00 00 00 80 00 ERRORFRA=
ME
controller-problem{tx-error-passive}
error-counter-tx-rx{{128}{0}}
Post by Wolfgang Grandegger
(000.428328) can1 20000004 [8] 00 40 00 00 00 00 5F 00 =20
ERRORFRAME
controller-problem{back-to-error-active}
error-counter-tx-rx{{95}{0}}
Post by Wolfgang Grandegger
Post by Wolfgang Grandegger
2. set restart-ms=3D100
send messages with cangen
provoke a bus-off short-circuiting CAN low and high
remove the short-circuit
(044.014170) can1 20000004 [8] 00 20 00 00 00 00 88 00 =20
ERRORFRAME
controller-problem{tx-error-passive}
error-counter-tx-rx{{136}{0}}
(000.003175) can1 20000040 [8] 00 00 00 00 00 00 7F 00 =20
ERRORFRAME
bus-off
error-counter-tx-rx{{127}{0}}
(000.099664) can1 20000100 [8] 00 00 00 00 00 00 00 00 =20
ERRORFRAME
restarted-after-bus-off
(000.097246) can1 20000004 [8] 00 20 00 00 00 00 88 00 =20
ERRORFRAME
controller-problem{tx-error-passive}
error-counter-tx-rx{{136}{0}}
(000.003160) can1 20000040 [8] 00 00 00 00 00 00 7F 00 =20
ERRORFRAME
bus-off
error-counter-tx-rx{{127}{0}}
(000.099602) can1 20000100 [8] 00 00 00 00 00 00 00 00 =20
ERRORFRAME
restarted-after-bus-off
And the restart does come when the short-circuit is gone?
No, in fact the bus keeps restarting until the short-circuit is gone.

Note that I'm using peak_pci. It's sja1000 but maybe there is something=
different?

- Andri

--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Wolfgang Grandegger
2014-10-22 12:56:29 UTC
Permalink
On Wed, 22 Oct 2014 12:48:11 +0000, Andri Yngvason
Post by Wolfgang Grandegger
On Wed, 22 Oct 2014 11:48:26 +0000, Andri Yngvason
Post by Andri Yngvason
Post by Wolfgang Grandegger
To see if the state changes occur as expected could you please
record
Post by Andri Yngvason
Post by Wolfgang Grandegger
error message traces with candump for the following two
1. send messages with cangen
disconnect the cable
reconnect the cable after a while until the error active st=
ate
Post by Wolfgang Grandegger
is
Post by Andri Yngvason
Post by Wolfgang Grandegger
reached.
After cleaning up my mess, this is the output for the disconnected
cable
Post by Wolfgang Grandegger
Post by Andri Yngvason
(000.000000) can1 20000004 [8] 00 08 00 00 00 00 60 00 =20
ERRORFRAME
Post by Andri Yngvason
controller-problem{tx-error-warning}
error-counter-tx-rx{{96}{0}}
(000.003953) can1 20000004 [8] 00 20 00 00 00 00 80 00 =20
ERRORFRAME
Post by Andri Yngvason
controller-problem{tx-error-passive}
error-counter-tx-rx{{128}{0}}
(005.959170) can1 20000002 [8] 03 00 00 00 00 00 00 00 =20
ERRORFRAME
Post by Andri Yngvason
lost-arbitration{at bit 3}
I'm missing an error warning message here... at least on the SJA1000
it's
Post by Wolfgang Grandegger
triggered.
I found this peculiar as well. However, I don't get the warning witho=
ut
the
patch applied either.
Yes, because reporting decreasing state changes are not (yet) supported=
=2E
candump.log =20
(000.000000) can1 20000004 [8] 00 08 00 00 00 00 60 00 =20
ERRORFRAME
controller-problem{tx-error-warning}
error-counter-tx-rx{{96}{0}}
(000.002425) can1 20000004 [8] 00 20 00 00 00 00 80 00 =20
ERRORFRAME
controller-problem{tx-error-passive}
error-counter-tx-rx{{128}{0}}
=20
Could you please include the messages in the trace as well
(using "any,0:0,#FFFFFFFF").
Post by Wolfgang Grandegger
Post by Andri Yngvason
(000.428328) can1 20000004 [8] 00 40 00 00 00 00 5F 00 =20
ERRORFRAME
Post by Andri Yngvason
controller-problem{back-to-error-active}
error-counter-tx-rx{{95}{0}}
Post by Wolfgang Grandegger
Post by Wolfgang Grandegger
2. set restart-ms=3D100
send messages with cangen
provoke a bus-off short-circuiting CAN low and high
remove the short-circuit
(044.014170) can1 20000004 [8] 00 20 00 00 00 00 88 00 =20
ERRORFRAME
Post by Andri Yngvason
controller-problem{tx-error-passive}
error-counter-tx-rx{{136}{0}}
(000.003175) can1 20000040 [8] 00 00 00 00 00 00 7F 00 =20
ERRORFRAME
Post by Andri Yngvason
bus-off
error-counter-tx-rx{{127}{0}}
(000.099664) can1 20000100 [8] 00 00 00 00 00 00 00 00 =20
ERRORFRAME
Post by Andri Yngvason
restarted-after-bus-off
(000.097246) can1 20000004 [8] 00 20 00 00 00 00 88 00 =20
ERRORFRAME
Post by Andri Yngvason
controller-problem{tx-error-passive}
error-counter-tx-rx{{136}{0}}
(000.003160) can1 20000040 [8] 00 00 00 00 00 00 7F 00 =20
ERRORFRAME
Post by Andri Yngvason
bus-off
error-counter-tx-rx{{127}{0}}
(000.099602) can1 20000100 [8] 00 00 00 00 00 00 00 00 =20
ERRORFRAME
Post by Andri Yngvason
restarted-after-bus-off
And the restart does come when the short-circuit is gone?
No, in fact the bus keeps restarting until the short-circuit is gone.
=20
Note that I'm using peak_pci. It's sja1000 but maybe there is somethi=
ng
different?
No, because it's a memory mapped SJA1000 chip. What do you see with
"dmesg"
assuming that CONFIG_CAN_DEV_DEBUG is enabled.

Wolfgang.

--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andri Yngvason
2014-10-22 16:32:52 UTC
Permalink
Post by Wolfgang Grandegger
Post by Wolfgang Grandegger
Post by Andri Yngvason
restarted-after-bus-off
And the restart does come when the short-circuit is gone?
No, in fact the bus keeps restarting until the short-circuit is gone=
=2E
Post by Wolfgang Grandegger
Note that I'm using peak_pci. It's sja1000 but maybe there is someth=
ing
Post by Wolfgang Grandegger
different?
No, because it's a memory mapped SJA1000 chip. What do you see with
"dmesg"
assuming that CONFIG_CAN_DEV_DEBUG is enabled.
dmesg wasn't very informative so I added some debug output of my own. T=
his is
the output after I've fixed everything:
[ 3903.594640] peak_pci 0000:02:00.0: can1 at reg_base=3D0xf892a400
cfg_base=3D0xf8928000 irq=3D18
[ 3903.684937] peak_pci 0000:02:00.0 can1: setting BTR0=3D0x07 BTR1=3D0=
x05
[ 3922.009853] peak_pci 0000:02:00.0 can1: error warning interrupt; bs=3D=
false,
es=3Dtrue
[ 3922.017428] peak_pci 0000:02:00.0 can1: state=3D1, txerr=3D96, rxerr=
=3D1
[ 3922.023615] peak_pci 0000:02:00.0 can1: error passive interrupt; bs=3D=
false,
es=3Dtrue
[ 3922.031197] peak_pci 0000:02:00.0 can1: state=3D2, txerr=3D128, rxer=
r=3D1
[ 3928.401643] peak_pci 0000:02:00.0 can1: error passive interrupt; bs=3D=
false,
es=3Dtrue
[ 3928.409243] peak_pci 0000:02:00.0 can1: state=3D1, txerr=3D127, rxer=
r=3D0
[ 3934.804395] peak_pci 0000:02:00.0 can1: error warning interrupt; bs=3D=
false,
es=3Dfalse
[ 3934.812075] peak_pci 0000:02:00.0 can1: state=3D0, txerr=3D95, rxerr=
=3D0

Note that, between the two passive interrupts, es does not change. This=
is
correct according to
http://www.nxp.com/documents/data_sheet/SJA1000.pdf because ES only sig=
nifies
warning state.
Thus, "if (status & SR_ES)" is redundant.

@@ -440,11 +446,14 @@ static int sja1000_err(struct net_device *dev, ui=
nt8_t
isrc, uint8_t status)
}
if (isrc & IRQ_EPI) {
/* error passive interrupt */
- netdev_dbg(dev, "error passive interrupt\n");
- if (status & SR_ES)
- state =3D CAN_STATE_ERROR_PASSIVE;
+ netdev_dbg(dev, "error passive interrupt; bs=3D%s, es=3D%s\n",
+ status & SR_BS ? "true" : "false",
+ status & SR_ES ? "true" : "false");
+
+ if (state =3D=3D CAN_STATE_ERROR_PASSIVE)
+ state =3D CAN_STATE_ERROR_WARNING;
else
- state =3D CAN_STATE_ERROR_ACTIVE;
+ state =3D CAN_STATE_ERROR_PASSIVE;
}
if (isrc & IRQ_ALI) {
/* arbitration lost interrupt */

The data sheet says this about EPI:
set; this bit is set whenever the CAN controller has
reached the error passive status (at least one
error counter exceeds the protocol-defined level of
127) or if the CAN controller is in the error passive
status and enters the error active status again and
the EPIE bit is set within the interrupt enable
register

I'm a little bit concerned that it actually says that EPI is set on "er=
ror passive"
and "error passive to error active". However, the log says that txerr=3D=
127 on
that second interrupt. It makes more sense for it to be "error warning"=
=2E Might
this be a error in the datasheet?

Another thing that worries me is that when I enabled debug, txerr would=
advance
while the interrupt was being handled. This yielded mismatches between
the new state and the actual current state of the error counters. I mov=
ed the
reading of the RXERR/TXERR registers to the top of the error handler fu=
nction
and that fixed the problem for me. Might this still be an issue on slow=
er
platforms?

candump looks like this:
=2E..
(000.053403) can0 5B1 [5] D1 D1 BB 7C DF
(000.146664) can0 506 [8] 2C 6E 9C 77 5E F1 AE 2D
(000.053628) can0 488 [8] FF C8 AC 26 43 C5 AF 11
(000.146428) can0 5AD [8] C1 31 BD 1B 6B 8D 34 13
(000.053112) can0 658 [0]
(000.171131) can0 20000004 [8] 00 08 00 00 00 00 60 00 ERRORFRA=
ME
controller-problem{tx-error-warning}
error-counter-tx-rx{{96}{0}}
(000.013841) can0 20000004 [8] 00 20 00 00 00 00 80 00 ERRORFRA=
ME
controller-problem{tx-error-passive}
error-counter-tx-rx{{128}{0}}
(004.433642) can0 20000002 [8] 04 00 00 00 00 00 00 00 ERRORFRA=
ME
lost-arbitration{at bit 4}
(000.014785) can0 20000002 [8] 02 00 00 00 00 00 00 00 ERRORFRA=
ME
lost-arbitration{at bit 2}
(000.012565) can0 20000002 [8] 02 00 00 00 00 00 00 00 ERRORFRA=
ME
lost-arbitration{at bit 2}
(000.000006) can0 152 [8] 48 94 14 45 C3 60 8A 58
(000.000015) can0 6D7 [4] 3A B8 84 26
(000.012537) can0 20000002 [8] 02 00 00 00 00 00 00 00 ERRORFRA=
ME
lost-arbitration{at bit 2}
(000.014083) can0 2E7 [7] A2 6F F8 5F DF DD 3B
(000.000904) can0 757 [8] 4F 0E AB 57 C3 58 DB 75
(000.000680) can0 459 [5] 52 52 58 29 8C
=2E..
(000.000546) can0 3C7 [2] 18 AC
(000.000875) can0 130 [8] B3 57 0A 52 CC CA D3 08
(000.015962) can0 5DF [8] 64 14 6C 61 F9 FD E9 55
(000.000023) can0 5D2 [8] C5 B2 39 6B 65 45 4C 05
(000.014554) can0 20000002 [8] 03 00 00 00 00 00 00 00 ERRORFRA=
ME
lost-arbitration{at bit 3}
(000.000008) can0 2FC [8] 19 73 77 6C 86 91 D5 58
=2E..
(000.053183) can0 7A1 [6] F8 EE 7B 12 A3 43
(000.146734) can0 089 [5] 36 B8 A0 4F DD
(000.014032) can0 20000004 [8] 00 0C 00 00 00 00 7F 74 ERRORFRA=
ME
controller-problem{rx-error-warning,tx-error-warning}
error-counter-tx-rx{{127}{116}}
(000.039316) can0 684 [6] ED D1 1E 32 00 1D
(000.146892) can0 0D1 [8] CD 99 DE 18 62 B0 45 27
(000.053317) can0 687 [8] 43 37 CF 5E 6B A4 CA 3D
=2E..
(000.053561) can0 42A [4] 65 20 3B 5C
(000.146834) can0 3C2 [8] 32 50 A4 56 31 EC DD 55
(000.013948) can0 20000004 [8] 00 40 00 00 00 00 5F 54 ERRORFRA=
ME
controller-problem{back-to-error-active}
error-counter-tx-rx{{95}{84}}
(000.039372) can0 455 [5] 58 38 31 62 B6
=2E..

- Andri.
--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Wolfgang Grandegger
2014-10-22 18:42:05 UTC
Permalink
=20
Post by Wolfgang Grandegger
Post by Wolfgang Grandegger
Post by Andri Yngvason
restarted-after-bus-off
And the restart does come when the short-circuit is gone?
No, in fact the bus keeps restarting until the short-circuit is gon=
e.
Post by Wolfgang Grandegger
Note that I'm using peak_pci. It's sja1000 but maybe there is somet=
hing
Post by Wolfgang Grandegger
different?
No, because it's a memory mapped SJA1000 chip. What do you see with
"dmesg"
assuming that CONFIG_CAN_DEV_DEBUG is enabled.
dmesg wasn't very informative so I added some debug output of my own.=
This is
[ 3903.594640] peak_pci 0000:02:00.0: can1 at reg_base=3D0xf892a400
cfg_base=3D0xf8928000 irq=3D18
[ 3903.684937] peak_pci 0000:02:00.0 can1: setting BTR0=3D0x07 BTR1=3D=
0x05
[ 3922.009853] peak_pci 0000:02:00.0 can1: error warning interrupt; b=
s=3Dfalse,
es=3Dtrue
According to the manual: error active -> warning
[ 3922.017428] peak_pci 0000:02:00.0 can1: state=3D1, txerr=3D96, rxe=
rr=3D1
[ 3922.023615] peak_pci 0000:02:00.0 can1: error passive interrupt; b=
s=3Dfalse,
es=3Dtrue
warning -> error passive
[ 3922.031197] peak_pci 0000:02:00.0 can1: state=3D2, txerr=3D128, rx=
err=3D1
[ 3928.401643] peak_pci 0000:02:00.0 can1: error passive interrupt; b=
s=3Dfalse,
es=3Dtrue
error passive -> warning
[ 3928.409243] peak_pci 0000:02:00.0 can1: state=3D1, txerr=3D127, rx=
err=3D0
[ 3934.804395] peak_pci 0000:02:00.0 can1: error warning interrupt; b=
s=3Dfalse,
es=3Dfalse
warning -> error active
[ 3934.812075] peak_pci 0000:02:00.0 can1: state=3D0, txerr=3D95, rxe=
rr=3D0
=20
Note that, between the two passive interrupts, es does not change. Th=
is is
correct according to
http://www.nxp.com/documents/data_sheet/SJA1000.pdf because ES only s=
ignifies
warning state.
Thus, "if (status & SR_ES)" is redundant.
=20
@@ -440,11 +446,14 @@ static int sja1000_err(struct net_device *dev, =
uint8_t
isrc, uint8_t status)
}
if (isrc & IRQ_EPI) {
/* error passive interrupt */
- netdev_dbg(dev, "error passive interrupt\n");
- if (status & SR_ES)
- state =3D CAN_STATE_ERROR_PASSIVE;
+ netdev_dbg(dev, "error passive interrupt; bs=3D%s, es=3D%s\n=
",
+ status & SR_BS ? "true" : "false",
+ status & SR_ES ? "true" : "false");
+
+ if (state =3D=3D CAN_STATE_ERROR_PASSIVE)
+ state =3D CAN_STATE_ERROR_WARNING;
else
- state =3D CAN_STATE_ERROR_ACTIVE;
+ state =3D CAN_STATE_ERROR_PASSIVE;
}
if (isrc & IRQ_ALI) {
/* arbitration lost interrupt */
=20
set; this bit is set whenever the CAN controller has
reached the error passive status (at least one
error counter exceeds the protocol-defined level of
127) or if the CAN controller is in the error passive
status and enters the error active status again and
the EPIE bit is set within the interrupt enable
register
=20
I'm a little bit concerned that it actually says that EPI is set on "=
error passive"
and "error passive to error active". However, the log says that txerr=
=3D127 on
that second interrupt. It makes more sense for it to be "error warnin=
g". Might
this be a error in the datasheet?
IIRC, the CAN standard only knows error active and error passive.
Various vendors added the warning level where the level is sometimes
even programmable.
Another thing that worries me is that when I enabled debug, txerr wou=
ld advance
while the interrupt was being handled. This yielded mismatches betwee=
n
the new state and the actual current state of the error counters. I m=
oved the
reading of the RXERR/TXERR registers to the top of the error handler =
function
and that fixed the problem for me. Might this still be an issue on sl=
ower
platforms?
Yes, strictly speaking we cannot be sure that we read the correct error
counter when the state change is triggered via interrupt.
...
(000.053403) can0 5B1 [5] D1 D1 BB 7C DF
(000.146664) can0 506 [8] 2C 6E 9C 77 5E F1 AE 2D
(000.053628) can0 488 [8] FF C8 AC 26 43 C5 AF 11
(000.146428) can0 5AD [8] C1 31 BD 1B 6B 8D 34 13
(000.053112) can0 658 [0]
(000.171131) can0 20000004 [8] 00 08 00 00 00 00 60 00 ERRORF=
RAME
controller-problem{tx-error-warning}
error-counter-tx-rx{{96}{0}}
(000.013841) can0 20000004 [8] 00 20 00 00 00 00 80 00 ERRORF=
RAME
controller-problem{tx-error-passive}
error-counter-tx-rx{{128}{0}}
(004.433642) can0 20000002 [8] 04 00 00 00 00 00 00 00 ERRORF=
RAME
lost-arbitration{at bit 4}
(000.014785) can0 20000002 [8] 02 00 00 00 00 00 00 00 ERRORF=
RAME
lost-arbitration{at bit 2}
(000.012565) can0 20000002 [8] 02 00 00 00 00 00 00 00 ERRORF=
RAME
lost-arbitration{at bit 2}
(000.000006) can0 152 [8] 48 94 14 45 C3 60 8A 58
(000.000015) can0 6D7 [4] 3A B8 84 26
(000.012537) can0 20000002 [8] 02 00 00 00 00 00 00 00 ERRORF=
RAME
lost-arbitration{at bit 2}
Why do you see these errors? Are there electrical problems on the CAN
bus? And if no cable is connected just txerr should increase (and
decrease if it's reconnected!
(000.014083) can0 2E7 [7] A2 6F F8 5F DF DD 3B
(000.000904) can0 757 [8] 4F 0E AB 57 C3 58 DB 75
(000.000680) can0 459 [5] 52 52 58 29 8C
...
(000.000546) can0 3C7 [2] 18 AC
(000.000875) can0 130 [8] B3 57 0A 52 CC CA D3 08
(000.015962) can0 5DF [8] 64 14 6C 61 F9 FD E9 55
(000.000023) can0 5D2 [8] C5 B2 39 6B 65 45 4C 05
(000.014554) can0 20000002 [8] 03 00 00 00 00 00 00 00 ERRORF=
RAME
lost-arbitration{at bit 3}
(000.000008) can0 2FC [8] 19 73 77 6C 86 91 D5 58
...
(000.053183) can0 7A1 [6] F8 EE 7B 12 A3 43
(000.146734) can0 089 [5] 36 B8 A0 4F DD
(000.014032) can0 20000004 [8] 00 0C 00 00 00 00 7F 74 ERRORF=
RAME
controller-problem{rx-error-warning,tx-error-warning}
error-counter-tx-rx{{127}{116}}
(000.039316) can0 684 [6] ED D1 1E 32 00 1D
(000.146892) can0 0D1 [8] CD 99 DE 18 62 B0 45 27
(000.053317) can0 687 [8] 43 37 CF 5E 6B A4 CA 3D
...
(000.053561) can0 42A [4] 65 20 3B 5C
(000.146834) can0 3C2 [8] 32 50 A4 56 31 EC DD 55
(000.013948) can0 20000004 [8] 00 40 00 00 00 00 5F 54 ERRORF=
RAME
controller-problem{back-to-error-active}
error-counter-tx-rx{{95}{84}}
(000.039372) can0 455 [5] 58 38 31 62 B6
...
Wolfgang.

--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andri Yngvason
2014-10-23 12:50:20 UTC
Permalink
Post by Wolfgang Grandegger
Post by Wolfgang Grandegger
Post by Wolfgang Grandegger
Post by Andri Yngvason
restarted-after-bus-off
And the restart does come when the short-circuit is gone?
No, in fact the bus keeps restarting until the short-circuit is go=
ne.
Post by Wolfgang Grandegger
Post by Wolfgang Grandegger
Note that I'm using peak_pci. It's sja1000 but maybe there is some=
thing
Post by Wolfgang Grandegger
Post by Wolfgang Grandegger
different?
No, because it's a memory mapped SJA1000 chip. What do you see with
"dmesg"
assuming that CONFIG_CAN_DEV_DEBUG is enabled.
dmesg wasn't very informative so I added some debug output of my own=
=2E This is
Post by Wolfgang Grandegger
[ 3903.594640] peak_pci 0000:02:00.0: can1 at reg_base=3D0xf892a400
cfg_base=3D0xf8928000 irq=3D18
[ 3903.684937] peak_pci 0000:02:00.0 can1: setting BTR0=3D0x07 BTR1=3D=
0x05
Post by Wolfgang Grandegger
[ 3922.009853] peak_pci 0000:02:00.0 can1: error warning interrupt; =
bs=3Dfalse,
Post by Wolfgang Grandegger
es=3Dtrue
According to the manual: error active -> warning
[ 3922.017428] peak_pci 0000:02:00.0 can1: state=3D1, txerr=3D96, rx=
err=3D1
Post by Wolfgang Grandegger
[ 3922.023615] peak_pci 0000:02:00.0 can1: error passive interrupt; =
bs=3Dfalse,
Post by Wolfgang Grandegger
es=3Dtrue
warning -> error passive
[ 3922.031197] peak_pci 0000:02:00.0 can1: state=3D2, txerr=3D128, r=
xerr=3D1
Post by Wolfgang Grandegger
[ 3928.401643] peak_pci 0000:02:00.0 can1: error passive interrupt; =
bs=3Dfalse,
Post by Wolfgang Grandegger
es=3Dtrue
error passive -> warning
[ 3928.409243] peak_pci 0000:02:00.0 can1: state=3D1, txerr=3D127, r=
xerr=3D0
Post by Wolfgang Grandegger
[ 3934.804395] peak_pci 0000:02:00.0 can1: error warning interrupt; =
bs=3Dfalse,
Post by Wolfgang Grandegger
es=3Dfalse
warning -> error active
[ 3934.812075] peak_pci 0000:02:00.0 can1: state=3D0, txerr=3D95, rx=
err=3D0
Post by Wolfgang Grandegger
Note that, between the two passive interrupts, es does not change. T=
his is
Post by Wolfgang Grandegger
correct according to
http://www.nxp.com/documents/data_sheet/SJA1000.pdf because ES only =
signifies
Post by Wolfgang Grandegger
warning state.
Thus, "if (status & SR_ES)" is redundant.
@@ -440,11 +446,14 @@ static int sja1000_err(struct net_device *dev,=
uint8_t
Post by Wolfgang Grandegger
isrc, uint8_t status)
}
if (isrc & IRQ_EPI) {
/* error passive interrupt */
- netdev_dbg(dev, "error passive interrupt\n");
- if (status & SR_ES)
- state =3D CAN_STATE_ERROR_PASSIVE;
+ netdev_dbg(dev, "error passive interrupt; bs=3D%s, es=3D%s\=
n",
Post by Wolfgang Grandegger
+ status & SR_BS ? "true" : "false",
+ status & SR_ES ? "true" : "false");
+
+ if (state =3D=3D CAN_STATE_ERROR_PASSIVE)
+ state =3D CAN_STATE_ERROR_WARNING;
else
- state =3D CAN_STATE_ERROR_ACTIVE;
+ state =3D CAN_STATE_ERROR_PASSIVE;
}
if (isrc & IRQ_ALI) {
/* arbitration lost interrupt */
set; this bit is set whenever the CAN controller has
reached the error passive status (at least one
error counter exceeds the protocol-defined level of
127) or if the CAN controller is in the error passive
status and enters the error active status again and
the EPIE bit is set within the interrupt enable
register
I'm a little bit concerned that it actually says that EPI is set on =
"error passive"
Post by Wolfgang Grandegger
and "error passive to error active". However, the log says that txer=
r=3D127 on
Post by Wolfgang Grandegger
that second interrupt. It makes more sense for it to be "error warni=
ng". Might
Post by Wolfgang Grandegger
this be a error in the datasheet?
IIRC, the CAN standard only knows error active and error passive.
Various vendors added the warning level where the level is sometimes
even programmable.
Yes, this makes sense. The manual doesn't even say that the "warning le=
vel" is a
state.
Post by Wolfgang Grandegger
Another thing that worries me is that when I enabled debug, txerr wo=
uld advance
Post by Wolfgang Grandegger
while the interrupt was being handled. This yielded mismatches betwe=
en
Post by Wolfgang Grandegger
the new state and the actual current state of the error counters. I =
moved the
Post by Wolfgang Grandegger
reading of the RXERR/TXERR registers to the top of the error handler=
function
Post by Wolfgang Grandegger
and that fixed the problem for me. Might this still be an issue on s=
lower
Post by Wolfgang Grandegger
platforms?
Yes, strictly speaking we cannot be sure that we read the correct err=
or
Post by Wolfgang Grandegger
counter when the state change is triggered via interrupt.
Ok, we'll just have to compare rxerr and txerr like before instead of u=
sing
errcnt_to_state.
Post by Wolfgang Grandegger
...
(000.053403) can0 5B1 [5] D1 D1 BB 7C DF
(000.146664) can0 506 [8] 2C 6E 9C 77 5E F1 AE 2D
(000.053628) can0 488 [8] FF C8 AC 26 43 C5 AF 11
(000.146428) can0 5AD [8] C1 31 BD 1B 6B 8D 34 13
(000.053112) can0 658 [0]
(000.171131) can0 20000004 [8] 00 08 00 00 00 00 60 00 ERROR=
=46RAME
Post by Wolfgang Grandegger
controller-problem{tx-error-warning}
error-counter-tx-rx{{96}{0}}
(000.013841) can0 20000004 [8] 00 20 00 00 00 00 80 00 ERROR=
=46RAME
Post by Wolfgang Grandegger
controller-problem{tx-error-passive}
error-counter-tx-rx{{128}{0}}
(004.433642) can0 20000002 [8] 04 00 00 00 00 00 00 00 ERROR=
=46RAME
Post by Wolfgang Grandegger
lost-arbitration{at bit 4}
(000.014785) can0 20000002 [8] 02 00 00 00 00 00 00 00 ERROR=
=46RAME
Post by Wolfgang Grandegger
lost-arbitration{at bit 2}
(000.012565) can0 20000002 [8] 02 00 00 00 00 00 00 00 ERROR=
=46RAME
Post by Wolfgang Grandegger
lost-arbitration{at bit 2}
(000.000006) can0 152 [8] 48 94 14 45 C3 60 8A 58
(000.000015) can0 6D7 [4] 3A B8 84 26
(000.012537) can0 20000002 [8] 02 00 00 00 00 00 00 00 ERROR=
=46RAME
Post by Wolfgang Grandegger
lost-arbitration{at bit 2}
Why do you see these errors? Are there electrical problems on the CAN
bus? And if no cable is connected just txerr should increase (and
decrease if it's reconnected!
These errors are occurring when I connect the cable again. They might b=
e
due to bad contact while plugging in the cable.

Anyway. Like I said before, the controller does not enter error active =
state unless
there is some other device sending on the bus. I've looked at "dmesg" a=
nd there
isn't even an interrupt. The netlink interface also says error-warning.

When you were doing your experiments, did you perhaps have some node on
the bus that might have answered to some of the cangen messages?

In any case, my setup is like this:
* Two sja1000 from the same peak_pci connected to the same bus.
* Both send using cangen
* Resistance: 60 Ohm
* Cables are quite short.

I'll try to place the resistances differently, make cables shorter and =
see if that
does anything.
Post by Wolfgang Grandegger
(000.014083) can0 2E7 [7] A2 6F F8 5F DF DD 3B
(000.000904) can0 757 [8] 4F 0E AB 57 C3 58 DB 75
(000.000680) can0 459 [5] 52 52 58 29 8C
...
(000.000546) can0 3C7 [2] 18 AC
(000.000875) can0 130 [8] B3 57 0A 52 CC CA D3 08
(000.015962) can0 5DF [8] 64 14 6C 61 F9 FD E9 55
(000.000023) can0 5D2 [8] C5 B2 39 6B 65 45 4C 05
(000.014554) can0 20000002 [8] 03 00 00 00 00 00 00 00 ERROR=
=46RAME
Post by Wolfgang Grandegger
lost-arbitration{at bit 3}
(000.000008) can0 2FC [8] 19 73 77 6C 86 91 D5 58
...
(000.053183) can0 7A1 [6] F8 EE 7B 12 A3 43
(000.146734) can0 089 [5] 36 B8 A0 4F DD
(000.014032) can0 20000004 [8] 00 0C 00 00 00 00 7F 74 ERROR=
=46RAME
Post by Wolfgang Grandegger
controller-problem{rx-error-warning,tx-error-warning}
error-counter-tx-rx{{127}{116}}
(000.039316) can0 684 [6] ED D1 1E 32 00 1D
(000.146892) can0 0D1 [8] CD 99 DE 18 62 B0 45 27
(000.053317) can0 687 [8] 43 37 CF 5E 6B A4 CA 3D
...
(000.053561) can0 42A [4] 65 20 3B 5C
(000.146834) can0 3C2 [8] 32 50 A4 56 31 EC DD 55
(000.013948) can0 20000004 [8] 00 40 00 00 00 00 5F 54 ERROR=
=46RAME
Post by Wolfgang Grandegger
controller-problem{back-to-error-active}
error-counter-tx-rx{{95}{84}}
(000.039372) can0 455 [5] 58 38 31 62 B6
...
- Andri
--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Wolfgang Grandegger
2014-10-23 13:10:14 UTC
Permalink
On Thu, 23 Oct 2014 12:50:20 +0000, Andri Yngvason
=2E..
Post by Andri Yngvason
Post by Wolfgang Grandegger
Post by Andri Yngvason
set; this bit is set whenever the CAN controller has
reached the error passive status (at least one
error counter exceeds the protocol-defined level of
127) or if the CAN controller is in the error passive
status and enters the error active status again and
the EPIE bit is set within the interrupt enable
register
I'm a little bit concerned that it actually says that EPI is set on
"error passive"
and "error passive to error active". However, the log says that
txerr=3D127 on
that second interrupt. It makes more sense for it to be "error
warning". Might
this be a error in the datasheet?
IIRC, the CAN standard only knows error active and error passive.
Various vendors added the warning level where the level is sometimes
even programmable.
Yes, this makes sense. The manual doesn't even say that the "warning
level" is a
state.
=2E..
Post by Andri Yngvason
Post by Wolfgang Grandegger
Post by Andri Yngvason
(000.053403) can0 5B1 [5] D1 D1 BB 7C DF
(000.146664) can0 506 [8] 2C 6E 9C 77 5E F1 AE 2D
(000.053628) can0 488 [8] FF C8 AC 26 43 C5 AF 11
(000.146428) can0 5AD [8] C1 31 BD 1B 6B 8D 34 13
(000.053112) can0 658 [0]
(000.171131) can0 20000004 [8] 00 08 00 00 00 00 60 00 =20
ERRORFRAME
controller-problem{tx-error-warning}
error-counter-tx-rx{{96}{0}}
(000.013841) can0 20000004 [8] 00 20 00 00 00 00 80 00 =20
ERRORFRAME
controller-problem{tx-error-passive}
error-counter-tx-rx{{128}{0}}
(004.433642) can0 20000002 [8] 04 00 00 00 00 00 00 00 =20
ERRORFRAME
lost-arbitration{at bit 4}
(000.014785) can0 20000002 [8] 02 00 00 00 00 00 00 00 =20
ERRORFRAME
lost-arbitration{at bit 2}
(000.012565) can0 20000002 [8] 02 00 00 00 00 00 00 00 =20
ERRORFRAME
lost-arbitration{at bit 2}
(000.000006) can0 152 [8] 48 94 14 45 C3 60 8A 58
(000.000015) can0 6D7 [4] 3A B8 84 26
(000.012537) can0 20000002 [8] 02 00 00 00 00 00 00 00 =20
ERRORFRAME
lost-arbitration{at bit 2}
Why do you see these errors? Are there electrical problems on the CA=
N
Post by Andri Yngvason
Post by Wolfgang Grandegger
bus? And if no cable is connected just txerr should increase (and
decrease if it's reconnected!
These errors are occurring when I connect the cable again. They might=
be
Post by Andri Yngvason
due to bad contact while plugging in the cable.
But you already receive *good* messages!
Post by Andri Yngvason
Anyway. Like I said before, the controller does not enter error activ=
e
Post by Andri Yngvason
state unless
there is some other device sending on the bus. I've looked at "dmesg"
and
Post by Andri Yngvason
there
isn't even an interrupt. The netlink interface also says error-warnin=
g.

=46rom http://www.kvaser.com/about-can/the-can-protocol/can-error-handl=
ing/

"The rules for increasing and decreasing the error counters are somewha=
t
complex, but the principle is simple: transmit errors give 8 error poin=
ts,
and receive errors give 1 error point. Correctly transmitted and/or
received messages causes the counter(s) to decrease."
=20
Post by Andri Yngvason
When you were doing your experiments, did you perhaps have some node =
on
Post by Andri Yngvason
the bus that might have answered to some of the cangen messages?
=20
* Two sja1000 from the same peak_pci connected to the same bus.
* Both send using cangen
* Resistance: 60 Ohm
Hm, the cable should be terminated with 120 Ohm on both ends of the cab=
le.
BTW: what bitrate do you use?

Wolfgang.
--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Andri Yngvason
2014-10-23 15:55:54 UTC
Permalink
Post by Wolfgang Grandegger
On Thu, 23 Oct 2014 12:50:20 +0000, Andri Yngvason
...
Post by Andri Yngvason
Post by Wolfgang Grandegger
Post by Andri Yngvason
set; this bit is set whenever the CAN controller has
reached the error passive status (at least one
error counter exceeds the protocol-defined level of
127) or if the CAN controller is in the error passive
status and enters the error active status again and
the EPIE bit is set within the interrupt enable
register
I'm a little bit concerned that it actually says that EPI is set o=
n
Post by Wolfgang Grandegger
Post by Andri Yngvason
Post by Wolfgang Grandegger
Post by Andri Yngvason
"error passive"
and "error passive to error active". However, the log says that
txerr=3D127 on
that second interrupt. It makes more sense for it to be "error
warning". Might
this be a error in the datasheet?
IIRC, the CAN standard only knows error active and error passive.
Various vendors added the warning level where the level is sometime=
s
Post by Wolfgang Grandegger
Post by Andri Yngvason
Post by Wolfgang Grandegger
even programmable.
Yes, this makes sense. The manual doesn't even say that the "warning
level" is a
state.
...
Post by Andri Yngvason
Post by Wolfgang Grandegger
Post by Andri Yngvason
(000.053403) can0 5B1 [5] D1 D1 BB 7C DF
(000.146664) can0 506 [8] 2C 6E 9C 77 5E F1 AE 2D
(000.053628) can0 488 [8] FF C8 AC 26 43 C5 AF 11
(000.146428) can0 5AD [8] C1 31 BD 1B 6B 8D 34 13
(000.053112) can0 658 [0]
(000.171131) can0 20000004 [8] 00 08 00 00 00 00 60 00 =20
ERRORFRAME
controller-problem{tx-error-warning}
error-counter-tx-rx{{96}{0}}
(000.013841) can0 20000004 [8] 00 20 00 00 00 00 80 00 =20
ERRORFRAME
controller-problem{tx-error-passive}
error-counter-tx-rx{{128}{0}}
(004.433642) can0 20000002 [8] 04 00 00 00 00 00 00 00 =20
ERRORFRAME
lost-arbitration{at bit 4}
(000.014785) can0 20000002 [8] 02 00 00 00 00 00 00 00 =20
ERRORFRAME
lost-arbitration{at bit 2}
(000.012565) can0 20000002 [8] 02 00 00 00 00 00 00 00 =20
ERRORFRAME
lost-arbitration{at bit 2}
(000.000006) can0 152 [8] 48 94 14 45 C3 60 8A 58
(000.000015) can0 6D7 [4] 3A B8 84 26
(000.012537) can0 20000002 [8] 02 00 00 00 00 00 00 00 =20
ERRORFRAME
lost-arbitration{at bit 2}
Why do you see these errors? Are there electrical problems on the C=
AN
Post by Wolfgang Grandegger
Post by Andri Yngvason
Post by Wolfgang Grandegger
bus? And if no cable is connected just txerr should increase (and
decrease if it's reconnected!
These errors are occurring when I connect the cable again. They migh=
t be
Post by Wolfgang Grandegger
Post by Andri Yngvason
due to bad contact while plugging in the cable.
But you already receive *good* messages!
Post by Andri Yngvason
Anyway. Like I said before, the controller does not enter error acti=
ve
Post by Wolfgang Grandegger
Post by Andri Yngvason
state unless
there is some other device sending on the bus. I've looked at "dmesg=
"
Post by Wolfgang Grandegger
and
Post by Andri Yngvason
there
isn't even an interrupt. The netlink interface also says error-warni=
ng.
Post by Wolfgang Grandegger
Post by Andri Yngvason
From http://www.kvaser.com/about-can/the-can-protocol/can-error-hand=
ling/
Post by Wolfgang Grandegger
"The rules for increasing and decreasing the error counters are somew=
hat
Post by Wolfgang Grandegger
complex, but the principle is simple: transmit errors give 8 error po=
ints,
Post by Wolfgang Grandegger
and receive errors give 1 error point. Correctly transmitted and/or
received messages causes the counter(s) to decrease."
=20
Post by Andri Yngvason
When you were doing your experiments, did you perhaps have some node=
on
Post by Wolfgang Grandegger
Post by Andri Yngvason
the bus that might have answered to some of the cangen messages?
* Two sja1000 from the same peak_pci connected to the same bus.
* Both send using cangen
* Resistance: 60 Ohm
Hm, the cable should be terminated with 120 Ohm on both ends of the c=
able.
Post by Wolfgang Grandegger
BTW: what bitrate do you use?
The bitrate is 125k.
***@x86-20140911-072109:~# ip -s -d link show can0
15: can0: <NOARP,UP,LOWER_UP,ECHO> mtu 16 qdisc pfifo_fast state UP qle=
n 300
link/can
can state ERROR-ACTIVE restart-ms 50
bitrate 125000 sample-point 0.875
tq 1000 prop-seg 3 phase-seg1 3 phase-seg2 1 sjw 1
sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
clock 8000000
re-started bus-errors arbit-lost error-warn error-pass bus-off
1 0 28 11 11 2 =20
RX: bytes packets errors dropped overrun mcast =20
2216731 385309 0 0 0 0 =20
TX: bytes packets errors dropped carrier collsns
2523021 391003 28 1 0 0 =20


Anyway, I got this working as you as you said it should work!

I moved the terminating resistors so that can0 and can1 are always term=
inated,
even when disconnected from the bus. I'm not sure why this works, but i=
t does.

Maybe the problem had something to do with the fact that can0 was just =
floating
when I disconnected it.

...
(000.200059) can0 761 [1] 19
(000.200358) can0 7C9 [5] 02 D4 FA 22 2F
(000.200232) can0 180 [7] B8 BF 37 5E D0 EB 3C
(000.224712) can0 20000004 [8] 00 08 00 00 00 00 60 00 ERRORFRA=
ME
controller-problem{tx-error-warning}
error-counter-tx-rx{{96}{0}}
(000.013858) can0 20000004 [8] 00 20 00 00 00 00 80 00 ERRORFRA=
ME
controller-problem{tx-error-passive}
error-counter-tx-rx{{128}{0}}
(006.130890) can0 20F [8] 4F A3 0A 5E FA F9 E0 58
(000.000980) can0 539 [8] BF 4F 72 7C 82 E6 46 72
(000.000513) can0 70A [1] 87
...
(000.000914) can0 14F [7] 43 22 3E 76 B8 8E 2E
(000.001031) can0 7B6 [8] 31 EE 34 79 0E FB C0 0B
(000.000687) can0 73D [4] A0 B3 C1 35
(000.013928) can0 20000004 [8] 00 08 00 00 00 00 7F 00 ERRORFRA=
ME
controller-problem{tx-error-warning}
error-counter-tx-rx{{127}{0}}
(000.000618) can0 0D6 [3] 08 D5 D2
(000.000689) can0 14E [4] 20 A2 A0 3A
(000.000517) can0 7C0 [1] 95
=2E..
(000.199771) can0 7B5 [3] 24 1D 09
(000.200256) can0 632 [6] D9 29 E9 7A 34 71
(000.200170) can0 194 [7] F8 B4 50 67 2B 9E BB
(000.013881) can0 20000004 [8] 00 40 00 00 00 00 5F 00 ERRORFRA=
ME
controller-problem{back-to-error-active}
error-counter-tx-rx{{95}{0}}
(000.185735) can0 71B [0]
(000.200622) can0 090 [8] 62 29 1E 61 35 E3 FC 09
(000.200080) can0 628 [8] C0 54 25 7B CE 56 02 7D
(000.200605) can0 13A [8] 8F 07 70 3A B2 E5 82 29
...


- Andri
--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Loading...