Discussion:
[RESEND] [PATCH] net: CAN: at91_can.c: decrease likelyhood of RX overruns
David Jander
2014-06-26 09:41:26 UTC
Permalink
Use an RX kfifo to empty receive message boxes as soon as possible in
the interrupt handler to avoid RX overruns if napi polls are late due to
latency.

Signed-off-by: David Jander <***@protonic.nl>
---
drivers/net/can/at91_can.c | 100 ++++++++++++++++++++++++++++++++-------------
1 file changed, 71 insertions(+), 29 deletions(-)

diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c
index 6ee1acd..1c53a44 100644
--- a/drivers/net/can/at91_can.c
+++ b/drivers/net/can/at91_can.c
@@ -24,6 +24,7 @@
#include <linux/if_arp.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
+#include <linux/kfifo.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/of.h>
@@ -153,6 +154,16 @@ struct at91_priv {
struct at91_can_data *pdata;

canid_t mb0_id;
+
+/*
+ * The AT91 SoC CAN controller (specially the one in some newer SoCs)
+ * has very little message boxes. On a busy high-speed network, latency
+ * may be too high for napi to catch up before RX overrun occurs.
+ * Therefor we declare a big enough kfifo and fill it directly from
+ * interrupt.
+ */
+#define RX_KFIFO_SIZE 512
+ DECLARE_KFIFO_PTR(rx_fifo, struct sk_buff *);
};

static const struct at91_devtype_data at91_at91sam9263_data = {
@@ -449,6 +460,26 @@ static void at91_chip_stop(struct net_device *dev, enum can_state state)
priv->can.state = state;
}

+static int at91_rx_fifo_in(struct net_device *dev, struct sk_buff *skb)
+{
+ struct at91_priv *priv = netdev_priv(dev);
+ unsigned int len = kfifo_put(&priv->rx_fifo, skb);
+
+ if (len == sizeof(skb))
+ return 0;
+ return -ENOMEM;
+}
+
+static int at91_rx_fifo_out(struct net_device *dev, struct sk_buff **skb)
+{
+ struct at91_priv *priv = netdev_priv(dev);
+ unsigned int len = kfifo_get(&priv->rx_fifo, skb);
+
+ if (len == sizeof(skb))
+ return 0;
+ return -ENOENT;
+}
+
/*
* theory of operation:
*
@@ -578,7 +609,7 @@ static void at91_rx_overflow_err(struct net_device *dev)

cf->can_id |= CAN_ERR_CRTL;
cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW;
- netif_receive_skb(skb);
+ at91_rx_fifo_in(dev, skb);

stats->rx_packets++;
stats->rx_bytes += cf->can_dlc;
@@ -643,7 +674,7 @@ static void at91_read_msg(struct net_device *dev, unsigned int mb)
}

at91_read_mb(dev, mb, cf);
- netif_receive_skb(skb);
+ at91_rx_fifo_in(dev, skb);

stats->rx_packets++;
stats->rx_bytes += cf->can_dlc;
@@ -700,7 +731,7 @@ static void at91_read_msg(struct net_device *dev, unsigned int mb)
* quota.
*
*/
-static int at91_poll_rx(struct net_device *dev, int quota)
+static int at91_poll_rx(struct net_device *dev)
{
struct at91_priv *priv = netdev_priv(dev);
u32 reg_sr = at91_read(priv, AT91_SR);
@@ -708,14 +739,9 @@ static int at91_poll_rx(struct net_device *dev, int quota)
unsigned int mb;
int received = 0;

- if (priv->rx_next > get_mb_rx_low_last(priv) &&
- reg_sr & get_mb_rx_low_mask(priv))
- netdev_info(dev,
- "order of incoming frames cannot be guaranteed\n");
-
again:
for (mb = find_next_bit(addr, get_mb_tx_first(priv), priv->rx_next);
- mb < get_mb_tx_first(priv) && quota > 0;
+ mb < get_mb_tx_first(priv);
reg_sr = at91_read(priv, AT91_SR),
mb = find_next_bit(addr, get_mb_tx_first(priv), ++priv->rx_next)) {
at91_read_msg(dev, mb);
@@ -729,12 +755,11 @@ static int at91_poll_rx(struct net_device *dev, int quota)
at91_activate_rx_mb(priv, mb);

received++;
- quota--;
}

/* upper group completed, look again in lower */
if (priv->rx_next > get_mb_rx_low_last(priv) &&
- quota > 0 && mb > get_mb_rx_last(priv)) {
+ mb > get_mb_rx_last(priv)) {
priv->rx_next = get_mb_rx_first(priv);
goto again;
}
@@ -790,20 +815,17 @@ static void at91_poll_err_frame(struct net_device *dev,
}
}

-static int at91_poll_err(struct net_device *dev, int quota, u32 reg_sr)
+static int at91_poll_err(struct net_device *dev, u32 reg_sr)
{
struct sk_buff *skb;
struct can_frame *cf;

- if (quota == 0)
- return 0;
-
skb = alloc_can_err_skb(dev, &cf);
if (unlikely(!skb))
return 0;

at91_poll_err_frame(dev, cf, reg_sr);
- netif_receive_skb(skb);
+ at91_rx_fifo_in(dev, skb);

dev->stats.rx_packets++;
dev->stats.rx_bytes += cf->can_dlc;
@@ -811,15 +833,14 @@ static int at91_poll_err(struct net_device *dev, int quota, u32 reg_sr)
return 1;
}

-static int at91_poll(struct napi_struct *napi, int quota)
+static void at91_poll(struct net_device *dev)
{
- struct net_device *dev = napi->dev;
const struct at91_priv *priv = netdev_priv(dev);
u32 reg_sr = at91_read(priv, AT91_SR);
- int work_done = 0;
+ u32 reg_ier;

if (reg_sr & get_irq_mb_rx(priv))
- work_done += at91_poll_rx(dev, quota - work_done);
+ at91_poll_rx(dev);

/*
* The error bits are clear on read,
@@ -827,17 +848,30 @@ static int at91_poll(struct napi_struct *napi, int quota)
*/
reg_sr |= priv->reg_sr;
if (reg_sr & AT91_IRQ_ERR_FRAME)
- work_done += at91_poll_err(dev, quota - work_done, reg_sr);
+ at91_poll_err(dev, reg_sr);

- if (work_done < quota) {
- /* enable IRQs for frame errors and all mailboxes >= rx_next */
- u32 reg_ier = AT91_IRQ_ERR_FRAME;
- reg_ier |= get_irq_mb_rx(priv) & ~AT91_MB_MASK(priv->rx_next);
+ /* enable IRQs for frame errors and all mailboxes >= rx_next */
+ reg_ier = AT91_IRQ_ERR_FRAME;
+ reg_ier |= get_irq_mb_rx(priv) & ~AT91_MB_MASK(priv->rx_next);
+ at91_write(priv, AT91_IER, reg_ier);
+}

- napi_complete(napi);
- at91_write(priv, AT91_IER, reg_ier);
+static int at91_napi_poll(struct napi_struct *napi, int quota)
+{
+ struct net_device *dev = napi->dev;
+ const struct at91_priv *priv = netdev_priv(dev);
+ int work_done = 0;
+ struct sk_buff *skb = NULL;
+
+ while(!(kfifo_is_empty(&priv->rx_fifo)) && (work_done < quota)) {
+ at91_rx_fifo_out(dev, &skb);
+ netif_receive_skb(skb);
+ work_done ++;
}

+ if(work_done < quota)
+ napi_complete(napi);
+
return work_done;
}

@@ -1096,7 +1130,7 @@ static irqreturn_t at91_irq(int irq, void *dev_id)

handled = IRQ_HANDLED;

- /* Receive or error interrupt? -> napi */
+ /* Receive or error interrupt? -> put in rx_fifo and call napi */
if (reg_sr & (get_irq_mb_rx(priv) | AT91_IRQ_ERR_FRAME)) {
/*
* The error bits are clear on read,
@@ -1105,6 +1139,7 @@ static irqreturn_t at91_irq(int irq, void *dev_id)
priv->reg_sr = reg_sr;
at91_write(priv, AT91_IDR,
get_irq_mb_rx(priv) | AT91_IRQ_ERR_FRAME);
+ at91_poll(dev);
napi_schedule(&priv->napi);
}

@@ -1356,7 +1391,14 @@ static int at91_can_probe(struct platform_device *pdev)
priv->pdata = dev_get_platdata(&pdev->dev);
priv->mb0_id = 0x7ff;

- netif_napi_add(dev, &priv->napi, at91_poll, get_mb_rx_num(priv));
+ err = kfifo_alloc(&priv->rx_fifo, RX_KFIFO_SIZE, GFP_KERNEL);
+ if (err) {
+ dev_err(&pdev->dev, "allocating RX fifo failed\n");
+ goto exit_iounmap;
+ }
+
+ netif_napi_add(dev, &priv->napi, at91_napi_poll,
+ RX_KFIFO_SIZE > 64 ? 64 : RX_KFIFO_SIZE);

if (at91_is_sam9263(priv))
dev->sysfs_groups[0] = &at91_sysfs_attr_group;
--
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Alexander Stein
2014-10-02 12:41:25 UTC
Permalink
Hello David,

finally I got the chance to test your patch. I originally expected to test it on a AT91SAM9263, but I did it now on a AT91SAM9X35. The tests were done on a v3.17-rc7 kernel + a DT patch.
If I only run my CAN burst test without any other load on the ARM everything works fine, on the unpatched kernel, with your patch and also with rx-fifo branch of https://gitorious.org/linux-can/linux-can-next.
When running an iperf (client on PC) in parallel, the situation is as follows:
unpatched kernel:
driver hangs after ~15s. No messages are received again while the kernel is still running.
your patch:
37346 of 500000 msg lost
rx-fifo:
36806 of 500000 msg lost

The CAN burst test:
This is done by 2 external embedded boards starting to sent CAN frames once they receive a start CAN frame from the ARM board. Each one sents frames at 1MBit/s including their own individual CAN ID with 4 data bytes containing a counter.
Every 200ms each device starts sending a burst of 250 frames. Using two devices ensures that there is no space bewteen messages. They are sent as fast as possible.
All received frames are evaluated on ARM for message losts and reorderings.

I can't say which implementation is actually better, read as how much is improved. But as the driver doesn't lockup at least one of those should be added.
A problem on AT91SAM9X5 for the performance is it only has 8 mailboxes, compared to 16 on 9263.

Best regards,
Alexander
Post by David Jander
Use an RX kfifo to empty receive message boxes as soon as possible in
the interrupt handler to avoid RX overruns if napi polls are late due to
latency.
---
drivers/net/can/at91_can.c | 100 ++++++++++++++++++++++++++++++++-------------
1 file changed, 71 insertions(+), 29 deletions(-)
diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c
index 6ee1acd..1c53a44 100644
--- a/drivers/net/can/at91_can.c
+++ b/drivers/net/can/at91_can.c
@@ -24,6 +24,7 @@
#include <linux/if_arp.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
+#include <linux/kfifo.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/of.h>
@@ -153,6 +154,16 @@ struct at91_priv {
struct at91_can_data *pdata;
canid_t mb0_id;
+
+/*
+ * The AT91 SoC CAN controller (specially the one in some newer SoCs)
+ * has very little message boxes. On a busy high-speed network, latency
+ * may be too high for napi to catch up before RX overrun occurs.
+ * Therefor we declare a big enough kfifo and fill it directly from
+ * interrupt.
+ */
+#define RX_KFIFO_SIZE 512
+ DECLARE_KFIFO_PTR(rx_fifo, struct sk_buff *);
};
static const struct at91_devtype_data at91_at91sam9263_data = {
@@ -449,6 +460,26 @@ static void at91_chip_stop(struct net_device *dev, enum can_state state)
priv->can.state = state;
}
+static int at91_rx_fifo_in(struct net_device *dev, struct sk_buff *skb)
+{
+ struct at91_priv *priv = netdev_priv(dev);
+ unsigned int len = kfifo_put(&priv->rx_fifo, skb);
+
+ if (len == sizeof(skb))
+ return 0;
+ return -ENOMEM;
+}
+
+static int at91_rx_fifo_out(struct net_device *dev, struct sk_buff **skb)
+{
+ struct at91_priv *priv = netdev_priv(dev);
+ unsigned int len = kfifo_get(&priv->rx_fifo, skb);
+
+ if (len == sizeof(skb))
+ return 0;
+ return -ENOENT;
+}
+
/*
*
@@ -578,7 +609,7 @@ static void at91_rx_overflow_err(struct net_device *dev)
cf->can_id |= CAN_ERR_CRTL;
cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW;
- netif_receive_skb(skb);
+ at91_rx_fifo_in(dev, skb);
stats->rx_packets++;
stats->rx_bytes += cf->can_dlc;
@@ -643,7 +674,7 @@ static void at91_read_msg(struct net_device *dev, unsigned int mb)
}
at91_read_mb(dev, mb, cf);
- netif_receive_skb(skb);
+ at91_rx_fifo_in(dev, skb);
stats->rx_packets++;
stats->rx_bytes += cf->can_dlc;
@@ -700,7 +731,7 @@ static void at91_read_msg(struct net_device *dev, unsigned int mb)
* quota.
*
*/
-static int at91_poll_rx(struct net_device *dev, int quota)
+static int at91_poll_rx(struct net_device *dev)
{
struct at91_priv *priv = netdev_priv(dev);
u32 reg_sr = at91_read(priv, AT91_SR);
@@ -708,14 +739,9 @@ static int at91_poll_rx(struct net_device *dev, int quota)
unsigned int mb;
int received = 0;
- if (priv->rx_next > get_mb_rx_low_last(priv) &&
- reg_sr & get_mb_rx_low_mask(priv))
- netdev_info(dev,
- "order of incoming frames cannot be guaranteed\n");
-
for (mb = find_next_bit(addr, get_mb_tx_first(priv), priv->rx_next);
- mb < get_mb_tx_first(priv) && quota > 0;
+ mb < get_mb_tx_first(priv);
reg_sr = at91_read(priv, AT91_SR),
mb = find_next_bit(addr, get_mb_tx_first(priv), ++priv->rx_next)) {
at91_read_msg(dev, mb);
@@ -729,12 +755,11 @@ static int at91_poll_rx(struct net_device *dev, int quota)
at91_activate_rx_mb(priv, mb);
received++;
- quota--;
}
/* upper group completed, look again in lower */
if (priv->rx_next > get_mb_rx_low_last(priv) &&
- quota > 0 && mb > get_mb_rx_last(priv)) {
+ mb > get_mb_rx_last(priv)) {
priv->rx_next = get_mb_rx_first(priv);
goto again;
}
@@ -790,20 +815,17 @@ static void at91_poll_err_frame(struct net_device *dev,
}
}
-static int at91_poll_err(struct net_device *dev, int quota, u32 reg_sr)
+static int at91_poll_err(struct net_device *dev, u32 reg_sr)
{
struct sk_buff *skb;
struct can_frame *cf;
- if (quota == 0)
- return 0;
-
skb = alloc_can_err_skb(dev, &cf);
if (unlikely(!skb))
return 0;
at91_poll_err_frame(dev, cf, reg_sr);
- netif_receive_skb(skb);
+ at91_rx_fifo_in(dev, skb);
dev->stats.rx_packets++;
dev->stats.rx_bytes += cf->can_dlc;
@@ -811,15 +833,14 @@ static int at91_poll_err(struct net_device *dev, int quota, u32 reg_sr)
return 1;
}
-static int at91_poll(struct napi_struct *napi, int quota)
+static void at91_poll(struct net_device *dev)
{
- struct net_device *dev = napi->dev;
const struct at91_priv *priv = netdev_priv(dev);
u32 reg_sr = at91_read(priv, AT91_SR);
- int work_done = 0;
+ u32 reg_ier;
if (reg_sr & get_irq_mb_rx(priv))
- work_done += at91_poll_rx(dev, quota - work_done);
+ at91_poll_rx(dev);
/*
* The error bits are clear on read,
@@ -827,17 +848,30 @@ static int at91_poll(struct napi_struct *napi, int quota)
*/
reg_sr |= priv->reg_sr;
if (reg_sr & AT91_IRQ_ERR_FRAME)
- work_done += at91_poll_err(dev, quota - work_done, reg_sr);
+ at91_poll_err(dev, reg_sr);
- if (work_done < quota) {
- /* enable IRQs for frame errors and all mailboxes >= rx_next */
- u32 reg_ier = AT91_IRQ_ERR_FRAME;
- reg_ier |= get_irq_mb_rx(priv) & ~AT91_MB_MASK(priv->rx_next);
+ /* enable IRQs for frame errors and all mailboxes >= rx_next */
+ reg_ier = AT91_IRQ_ERR_FRAME;
+ reg_ier |= get_irq_mb_rx(priv) & ~AT91_MB_MASK(priv->rx_next);
+ at91_write(priv, AT91_IER, reg_ier);
+}
- napi_complete(napi);
- at91_write(priv, AT91_IER, reg_ier);
+static int at91_napi_poll(struct napi_struct *napi, int quota)
+{
+ struct net_device *dev = napi->dev;
+ const struct at91_priv *priv = netdev_priv(dev);
+ int work_done = 0;
+ struct sk_buff *skb = NULL;
+
+ while(!(kfifo_is_empty(&priv->rx_fifo)) && (work_done < quota)) {
+ at91_rx_fifo_out(dev, &skb);
+ netif_receive_skb(skb);
+ work_done ++;
}
+ if(work_done < quota)
+ napi_complete(napi);
+
return work_done;
}
@@ -1096,7 +1130,7 @@ static irqreturn_t at91_irq(int irq, void *dev_id)
handled = IRQ_HANDLED;
- /* Receive or error interrupt? -> napi */
+ /* Receive or error interrupt? -> put in rx_fifo and call napi */
if (reg_sr & (get_irq_mb_rx(priv) | AT91_IRQ_ERR_FRAME)) {
/*
* The error bits are clear on read,
@@ -1105,6 +1139,7 @@ static irqreturn_t at91_irq(int irq, void *dev_id)
priv->reg_sr = reg_sr;
at91_write(priv, AT91_IDR,
get_irq_mb_rx(priv) | AT91_IRQ_ERR_FRAME);
+ at91_poll(dev);
napi_schedule(&priv->napi);
}
@@ -1356,7 +1391,14 @@ static int at91_can_probe(struct platform_device *pdev)
priv->pdata = dev_get_platdata(&pdev->dev);
priv->mb0_id = 0x7ff;
- netif_napi_add(dev, &priv->napi, at91_poll, get_mb_rx_num(priv));
+ err = kfifo_alloc(&priv->rx_fifo, RX_KFIFO_SIZE, GFP_KERNEL);
+ if (err) {
+ dev_err(&pdev->dev, "allocating RX fifo failed\n");
+ goto exit_iounmap;
+ }
+
+ netif_napi_add(dev, &priv->napi, at91_napi_poll,
+ RX_KFIFO_SIZE > 64 ? 64 : RX_KFIFO_SIZE);
if (at91_is_sam9263(priv))
dev->sysfs_groups[0] = &at91_sysfs_attr_group;
--
Dipl.-Inf. Alexander Stein

SYS TEC electronic GmbH
Am Windrad 2
08468 Heinsdorfergrund
Tel.: 03765 38600-1156
Fax: 03765 38600-4100
Email: ***@systec-electronic.com
Website: www.systec-electronic.com

Managing Director: Dipl.-Phys. Siegmar Schmidt
Commercial registry: Amtsgericht Chemnitz, HRB 28082

--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
David Jander
2014-10-03 09:01:41 UTC
Permalink
Dear Alexander,

On Thu, 02 Oct 2014 14:41:25 +0200
Post by Alexander Stein
finally I got the chance to test your patch. I originally expected to test
it on a AT91SAM9263, but I did it now on a AT91SAM9X35. The tests were done
on a v3.17-rc7 kernel + a DT patch. If I only run my CAN burst test without
any other load on the ARM everything works fine, on the unpatched kernel,
with your patch and also with rx-fifo branch of
https://gitorious.org/linux-can/linux-can-next. When running an iperf
driver hangs after ~15s. No messages are received again while the kernel is
still running. your patch: 37346 of 500000 msg lost rx-fifo: 36806 of 500000
msg lost
Thanks a lot for taking the time to look at this.
I just looked at the rx-fifo patch, but I still don't understand how it is
supposed to improve the situation of this driver.... beats me.
Nevertheless you just proved that it is at least as good as my patch.
AFAIK, there is nothing that should work as well as off-loading the CAN
controller in the IRQ handler by a far margin. But the rx-fifo patch does not
do that, so it is hard for me to believe it is really that good.
Could you repeat your test at a lower bitrate? The only thing I can think of
is that 37000 out of 500000 messages the latency has spiked on your system,
but that spike should be a lot more contained with my patch than with rx-fifo,
so if I'm right, then lowering the bitrate we might see a situation in which
rx-fifo still loses a message here and there, while my patch doesn't.
Other than that, I am tempted to think my patch is simply broken.

Or, maybe Marc can explain what he did... because I don't really get it :-(
Post by Alexander Stein
This is done by 2 external embedded boards starting to sent CAN frames once
they receive a start CAN frame from the ARM board. Each one sents frames at
1MBit/s including their own individual CAN ID with 4 data bytes containing a
counter. Every 200ms each device starts sending a burst of 250 frames. Using
two devices ensures that there is no space bewteen messages. They are sent
as fast as possible. All received frames are evaluated on ARM for message
losts and reorderings.
I can't say which implementation is actually better, read as how much is
improved. But as the driver doesn't lockup at least one of those should be
added. A problem on AT91SAM9X5 for the performance is it only has 8
mailboxes, compared to 16 on 9263.
Yes, it seems Atmel is going the other way around compared to Freescale, they
are making newer implementations actually more crippled than older ones :-(

Best regards,
--
David Jander
Protonic Holland.
--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Alexander Stein
2014-10-06 08:52:05 UTC
Permalink
Hello David,
Post by David Jander
On Thu, 02 Oct 2014 14:41:25 +0200
Post by Alexander Stein
finally I got the chance to test your patch. I originally expected to test
it on a AT91SAM9263, but I did it now on a AT91SAM9X35. The tests were done
on a v3.17-rc7 kernel + a DT patch. If I only run my CAN burst test without
any other load on the ARM everything works fine, on the unpatched kernel,
with your patch and also with rx-fifo branch of
https://gitorious.org/linux-can/linux-can-next. When running an iperf
driver hangs after ~15s. No messages are received again while the kernel is
still running. your patch: 37346 of 500000 msg lost rx-fifo: 36806 of 500000
msg lost
Thanks a lot for taking the time to look at this.
I just looked at the rx-fifo patch, but I still don't understand how it is
supposed to improve the situation of this driver.... beats me.
Nevertheless you just proved that it is at least as good as my patch.
AFAIK, there is nothing that should work as well as off-loading the CAN
controller in the IRQ handler by a far margin. But the rx-fifo patch does not
do that, so it is hard for me to believe it is really that good.
Could you repeat your test at a lower bitrate? The only thing I can think of
is that 37000 out of 500000 messages the latency has spiked on your system,
but that spike should be a lot more contained with my patch than with rx-fifo,
so if I'm right, then lowering the bitrate we might see a situation in which
rx-fifo still loses a message here and there, while my patch doesn't.
Other than that, I am tempted to think my patch is simply broken.
Ok, here is another test run (including iperf) at 250kBit/s. Did all tests 3 times.
plain: 0, 2, lockup
your patch: 0, 0, 0
rx-fifo: 26, 0, 43

When the plain driver lockups I see those kernel messages:
at91_can f8004000.can can0: order of incoming frames cannot be guaranteed

And the same with 500kBit/s:
plain: 0, 0, lockup
your patch: 0, 0, 0
rx-fifo: 0, 0, 0

I'm wondering why here rx-fifo doesn't raise any misses at 500kBit/s while there were some at 250kBit/s, maybe I just got lucky to detect it then... I get the impression that iperf influence is somewhat different from time to time. For this reason I redid the test at 1MBit/s:
plain: lockup, lockup, lockup
your patch: 37904, 36436, 37570
rx-fifo: 35626, 34018, 36451

As expected all variants lost frames while with the unmodified kernel the driver lockups.

I only have firmware for the embedded devices which support those bitrates tested, so I can't run again with e.g. 800kBit/s, but it shows that your patch improves the situation a lot. No more driver lockups, and no message losts at at least smaller bitrates.

Best regards,
Alexander
--
Dipl.-Inf. Alexander Stein

SYS TEC electronic GmbH
Am Windrad 2
08468 Heinsdorfergrund
Tel.: 03765 38600-1156
Fax: 03765 38600-4100
Email: ***@systec-electronic.com
Website: www.systec-electronic.com

Managing Director: Dipl.-Phys. Siegmar Schmidt
Commercial registry: Amtsgericht Chemnitz, HRB 28082

--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
David Jander
2014-10-06 09:26:44 UTC
Permalink
Hi Alexander,

On Mon, 06 Oct 2014 10:52:05 +0200
Post by Alexander Stein
Hello David,
Post by David Jander
On Thu, 02 Oct 2014 14:41:25 +0200
Post by Alexander Stein
finally I got the chance to test your patch. I originally expected to
test it on a AT91SAM9263, but I did it now on a AT91SAM9X35. The tests
were done on a v3.17-rc7 kernel + a DT patch. If I only run my CAN burst
test without any other load on the ARM everything works fine, on the
unpatched kernel, with your patch and also with rx-fifo branch of
https://gitorious.org/linux-can/linux-can-next. When running an iperf
(client on PC) in parallel, the situation is as follows: unpatched
kernel: driver hangs after ~15s. No messages are received again while
the kernel is still running. your patch: 37346 of 500000 msg lost
rx-fifo: 36806 of 500000 msg lost
Thanks a lot for taking the time to look at this.
I just looked at the rx-fifo patch, but I still don't understand how it is
supposed to improve the situation of this driver.... beats me.
Nevertheless you just proved that it is at least as good as my patch.
AFAIK, there is nothing that should work as well as off-loading the CAN
controller in the IRQ handler by a far margin. But the rx-fifo patch does
not do that, so it is hard for me to believe it is really that good.
Could you repeat your test at a lower bitrate? The only thing I can think
of is that 37000 out of 500000 messages the latency has spiked on your
system, but that spike should be a lot more contained with my patch than
with rx-fifo, so if I'm right, then lowering the bitrate we might see a
situation in which rx-fifo still loses a message here and there, while my
patch doesn't. Other than that, I am tempted to think my patch is simply
broken.
Ok, here is another test run (including iperf) at 250kBit/s. Did all tests 3
times. plain: 0, 2, lockup
your patch: 0, 0, 0
rx-fifo: 26, 0, 43
Ok, this confirms what I suspected... latency-peaks are more contained when
emptying the CAN controller in the interrupt handler.
Post by Alexander Stein
at91_can f8004000.can can0: order of incoming frames cannot be guaranteed
plain: 0, 0, lockup
your patch: 0, 0, 0
rx-fifo: 0, 0, 0
This is weird. Either you were lucky, your embedded devices aren't able to
send back-to-back at that rate specifically, or the situation regarding load
and latency spikes changed somehow. The results don't make sense to me.
One interesting control-metric would be to monitor the amount of
messages/second your test-devices are able to generate.
Post by Alexander Stein
I'm wondering why here rx-fifo doesn't raise any misses at 500kBit/s while
there were some at 250kBit/s, maybe I just got lucky to detect it then... I
get the impression that iperf influence is somewhat different from time to
plain: lockup, lockup, lockup
your patch: 37904, 36436, 37570
rx-fifo: 35626, 34018, 36451
Maybe this case can still be improved on my patch through some optimizations,
but obviously the driver and/or controller are not well suited for such high
bitrates.
Post by Alexander Stein
As expected all variants lost frames while with the unmodified kernel the driver lockups.
I only have firmware for the embedded devices which support those bitrates
tested, so I can't run again with e.g. 800kBit/s, but it shows that your
patch improves the situation a lot. No more driver lockups, and no message
losts at at least smaller bitrates.
That was the purpose of the patch. Somehow I feel that there's still a lot of
room for improvement. I would like to see Marc's rx-fifo work enhanced and
base both the at91_can- and flexcan improvements on that. Unfortunately I have
very little time to dedicate to this, and no hardware for testing/debugging
at91_can anymore.

Thanks a lot for taking the time to test!

Best regards,
--
David Jander
Protonic Holland.
--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Alexander Stein
2014-10-06 11:21:22 UTC
Permalink
Post by David Jander
Post by Alexander Stein
Hello David,
Post by David Jander
On Thu, 02 Oct 2014 14:41:25 +0200
Post by Alexander Stein
finally I got the chance to test your patch. I originally expected to
test it on a AT91SAM9263, but I did it now on a AT91SAM9X35. The tests
were done on a v3.17-rc7 kernel + a DT patch. If I only run my CAN burst
test without any other load on the ARM everything works fine, on the
unpatched kernel, with your patch and also with rx-fifo branch of
https://gitorious.org/linux-can/linux-can-next. When running an iperf
(client on PC) in parallel, the situation is as follows: unpatched
kernel: driver hangs after ~15s. No messages are received again while
the kernel is still running. your patch: 37346 of 500000 msg lost
rx-fifo: 36806 of 500000 msg lost
Thanks a lot for taking the time to look at this.
I just looked at the rx-fifo patch, but I still don't understand how it is
supposed to improve the situation of this driver.... beats me.
Nevertheless you just proved that it is at least as good as my patch.
AFAIK, there is nothing that should work as well as off-loading the CAN
controller in the IRQ handler by a far margin. But the rx-fifo patch does
not do that, so it is hard for me to believe it is really that good.
Could you repeat your test at a lower bitrate? The only thing I can think
of is that 37000 out of 500000 messages the latency has spiked on your
system, but that spike should be a lot more contained with my patch than
with rx-fifo, so if I'm right, then lowering the bitrate we might see a
situation in which rx-fifo still loses a message here and there, while my
patch doesn't. Other than that, I am tempted to think my patch is simply
broken.
Ok, here is another test run (including iperf) at 250kBit/s. Did all tests 3
times. plain: 0, 2, lockup
your patch: 0, 0, 0
rx-fifo: 26, 0, 43
Ok, this confirms what I suspected... latency-peaks are more contained when
emptying the CAN controller in the interrupt handler.
Post by Alexander Stein
at91_can f8004000.can can0: order of incoming frames cannot be guaranteed
plain: 0, 0, lockup
your patch: 0, 0, 0
rx-fifo: 0, 0, 0
This is weird. Either you were lucky, your embedded devices aren't able to
send back-to-back at that rate specifically, or the situation regarding load
and latency spikes changed somehow. The results don't make sense to me.
Well, I guess this will change if I would run more than 3 times, but as overruns already occured at 250kBit/s there _is_ still a problem in rx-fifo, independently from 1MBit/s drops due to heavy load.
Post by David Jander
One interesting control-metric would be to monitor the amount of
messages/second your test-devices are able to generate.
I just noticed that this testing hardware has a DDR2 with only 16bit interface. I think this will also reduce performance considerably.
The embedded device send ~1000 CAN frames/s, each which is an average busload of 20%, but in burst time, it should be 100%.

Best regards,
Alexander
--
Dipl.-Inf. Alexander Stein

SYS TEC electronic GmbH
Am Windrad 2
08468 Heinsdorfergrund
Tel.: 03765 38600-1156
Fax: 03765 38600-4100
Email: ***@systec-electronic.com
Website: www.systec-electronic.com

Managing Director: Dipl.-Phys. Siegmar Schmidt
Commercial registry: Amtsgericht Chemnitz, HRB 28082

--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
David Jander
2014-10-06 11:39:42 UTC
Permalink
On Mon, 06 Oct 2014 13:21:22 +0200
Post by Alexander Stein
Post by David Jander
Post by Alexander Stein
Hello David,
Post by David Jander
On Thu, 02 Oct 2014 14:41:25 +0200
Post by Alexander Stein
finally I got the chance to test your patch. I originally expected to
test it on a AT91SAM9263, but I did it now on a AT91SAM9X35. The
tests were done on a v3.17-rc7 kernel + a DT patch. If I only run my
CAN burst test without any other load on the ARM everything works
fine, on the unpatched kernel, with your patch and also with rx-fifo
branch of https://gitorious.org/linux-can/linux-can-next. When
running an iperf (client on PC) in parallel, the situation is as
follows: unpatched kernel: driver hangs after ~15s. No messages are
received again while the kernel is still running. your patch: 37346
of 500000 msg lost rx-fifo: 36806 of 500000 msg lost
Thanks a lot for taking the time to look at this.
I just looked at the rx-fifo patch, but I still don't understand how
it is supposed to improve the situation of this driver.... beats me.
Nevertheless you just proved that it is at least as good as my patch.
AFAIK, there is nothing that should work as well as off-loading the CAN
controller in the IRQ handler by a far margin. But the rx-fifo patch
does not do that, so it is hard for me to believe it is really that
good. Could you repeat your test at a lower bitrate? The only thing I
can think of is that 37000 out of 500000 messages the latency has
spiked on your system, but that spike should be a lot more contained
with my patch than with rx-fifo, so if I'm right, then lowering the
bitrate we might see a situation in which rx-fifo still loses a
message here and there, while my patch doesn't. Other than that, I am
tempted to think my patch is simply broken.
Ok, here is another test run (including iperf) at 250kBit/s. Did all
tests 3 times. plain: 0, 2, lockup
your patch: 0, 0, 0
rx-fifo: 26, 0, 43
Ok, this confirms what I suspected... latency-peaks are more contained when
emptying the CAN controller in the interrupt handler.
Post by Alexander Stein
at91_can f8004000.can can0: order of incoming frames cannot be guaranteed
plain: 0, 0, lockup
your patch: 0, 0, 0
rx-fifo: 0, 0, 0
This is weird. Either you were lucky, your embedded devices aren't able to
send back-to-back at that rate specifically, or the situation regarding
load and latency spikes changed somehow. The results don't make sense to
me.
Well, I guess this will change if I would run more than 3 times, but as
overruns already occured at 250kBit/s there _is_ still a problem in rx-fifo,
independently from 1MBit/s drops due to heavy load.
Well, I now think that rx-fifo was never intended to improve the driver
performance (correct me if I'm wrong, Marc), but only to build a common
subsystem around the same concept that seems to be re-invented in at91_can,
flexcan and ti_hecc. It does fix the lockup-bug in at91_can though.
Post by Alexander Stein
Post by David Jander
One interesting control-metric would be to monitor the amount of
messages/second your test-devices are able to generate.
I just noticed that this testing hardware has a DDR2 with only 16bit
interface. I think this will also reduce performance considerably. The
16-bit DDR2 (even at its lowest clock of 200MHz) is not exactly slow... why do
you think that could be a bottleneck?
Post by Alexander Stein
embedded device send ~1000 CAN frames/s, each which is an average busload of
20%, but in burst time, it should be 100%.
You mean the bursts of 250 messages you talked about earlier should produce a
bus load of 100%? I think it is important to get some certainty about what's
really going on on the bus, specially if we see things we cannot explain. You
don't have access to a PC with a CAN interface or an oscilloscope, do you?

Best regards,
--
David Jander
Protonic Holland.
--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Marc Kleine-Budde
2014-10-06 12:52:33 UTC
Permalink
On 10/06/2014 01:39 PM, David Jander wrote:
[...]
Post by David Jander
Post by Alexander Stein
Post by David Jander
Post by Alexander Stein
plain: 0, 0, lockup
your patch: 0, 0, 0
rx-fifo: 0, 0, 0
This is weird. Either you were lucky, your embedded devices aren't able to
send back-to-back at that rate specifically, or the situation regarding
load and latency spikes changed somehow. The results don't make sense to
me.
Well, I guess this will change if I would run more than 3 times, but as
overruns already occured at 250kBit/s there _is_ still a problem in rx-fifo,
independently from 1MBit/s drops due to heavy load.
Well, I now think that rx-fifo was never intended to improve the driver
performance (correct me if I'm wrong, Marc), but only to build a common
subsystem around the same concept that seems to be re-invented in at91_can,
flexcan and ti_hecc. It does fix the lockup-bug in at91_can though.
Exactly. It's supposed to abstract the make-a-fifo code from the
hardware and move it into the CAN driver core.

Marc
--
Pengutronix e.K. | Marc Kleine-Budde |
Industrial Linux Solutions | Phone: +49-231-2826-924 |
Vertretung West/Dortmund | Fax: +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686 | http://www.pengutronix.de |
Alexander Stein
2014-10-06 14:14:05 UTC
Permalink
Post by David Jander
Post by Alexander Stein
Post by David Jander
One interesting control-metric would be to monitor the amount of
messages/second your test-devices are able to generate.
I just noticed that this testing hardware has a DDR2 with only 16bit
interface. I think this will also reduce performance considerably. The
16-bit DDR2 (even at its lowest clock of 200MHz) is not exactly slow... why do
you think that could be a bottleneck?
It's a 133MHz clock, so 266MHz effective.
I had done similar tests with i.MX28-EVK which also has only a 16-bit DDR interface. The rusults were horrible. Even without that ethernet bug...
Now, doing the same tests (also CPU and Memory) again, it is worse than a AT91SAM9G20 with 32-bit SDR RAM, which also has 400 MHz. I expected both would be equally.
If you're correct and the RAM is not the bottleneck it seems that the cache (16kByte each, 32kByte on AT91SAM9G20) would be it.
Post by David Jander
Post by Alexander Stein
embedded device send ~1000 CAN frames/s, each which is an average busload of
20%, but in burst time, it should be 100%.
You mean the bursts of 250 messages you talked about earlier should produce a
bus load of 100%? I think it is important to get some certainty about what's
really going on on the bus, specially if we see things we cannot explain. You
don't have access to a PC with a CAN interface or an oscilloscope, do you?
My PC has our USB-CAN-Modules attached (single or dual channel), so what do you want to see? a candump log?
My access to oscilloscopes rather limited and not reliable.

Best regards,
Alexander
--
Dipl.-Inf. Alexander Stein

SYS TEC electronic GmbH
Am Windrad 2
08468 Heinsdorfergrund
Tel.: 03765 38600-1156
Fax: 03765 38600-4100
Email: ***@systec-electronic.com
Website: www.systec-electronic.com

Managing Director: Dipl.-Phys. Siegmar Schmidt
Commercial registry: Amtsgericht Chemnitz, HRB 28082

--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
David Jander
2014-10-07 08:31:48 UTC
Permalink
On Mon, 06 Oct 2014 16:14:05 +0200
Post by Alexander Stein
Post by David Jander
Post by Alexander Stein
Post by David Jander
One interesting control-metric would be to monitor the amount of
messages/second your test-devices are able to generate.
I just noticed that this testing hardware has a DDR2 with only 16bit
interface. I think this will also reduce performance considerably. The
16-bit DDR2 (even at its lowest clock of 200MHz) is not exactly slow...
why do you think that could be a bottleneck?
It's a 133MHz clock, so 266MHz effective.
Sorry, my fault. The lowest permitted DDR2 clock is 125MHz.
Post by Alexander Stein
I had done similar tests with i.MX28-EVK which also has only a 16-bit DDR
interface. The rusults were horrible. Even without that ethernet bug...
Horrible seems a bit of a stretch... I have an i.MX28 board on my desk (with
16-bit 200MHz DDR2), which is capable of spewing out 12200 messages per second
at 1Mbaud when idle. The theoretical maximum bus load (DLC=1) would be around
20000 messages per second, so at least I am able to keep up with the bandwidth
corresponding to a 500kbaud bus at 100% load.
From my PC (with a Peak USB CAN dongle) I can send only 5200 messages per
second (and this is a Core-i7 with 128bit DDR3 memory interface clocked at
800MHz ;-), so if the interfaces from SYS-Tec are significantly better than
that we might have a deal... ;-)
Post by Alexander Stein
Now, doing the same tests (also CPU and Memory) again, it is worse than a
AT91SAM9G20 with 32-bit SDR RAM, which also has 400 MHz. I expected both
would be equally. If you're correct and the RAM is not the bottleneck it
seems that the cache (16kByte each, 32kByte on AT91SAM9G20) would be it.
The i.MX28 and the AT91SAM9 have totally different CAN controllers to start
with, so it is more like comparing apples with oranges I guess.
Post by Alexander Stein
Post by David Jander
Post by Alexander Stein
embedded device send ~1000 CAN frames/s, each which is an average
busload of 20%, but in burst time, it should be 100%.
You mean the bursts of 250 messages you talked about earlier should
produce a bus load of 100%? I think it is important to get some certainty
about what's really going on on the bus, specially if we see things we
cannot explain. You don't have access to a PC with a CAN interface or an
oscilloscope, do you?
My PC has our USB-CAN-Modules attached (single or dual channel), so what do
you want to see? a candump log? My access to oscilloscopes rather limited
and not reliable.
You can try this (which is what I do):

$ time cangen -g 0 -I 123 -L 1 -D i -p 2 -n 10000 can0

If you tx-queue is smallish compared to the 10000 messages sent, then the
"real" time reported by "time" will correspond approximately to the amount of
time it took to actually transmit 10000 messages.

Best regards,
--
David Jander
Protonic Holland.
--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Alexander Stein
2014-10-07 11:36:42 UTC
Permalink
Post by David Jander
Post by Alexander Stein
I had done similar tests with i.MX28-EVK which also has only a 16-bit DDR
interface. The rusults were horrible. Even without that ethernet bug...
Horrible seems a bit of a stretch... I have an i.MX28 board on my desk (with
16-bit 200MHz DDR2), which is capable of spewing out 12200 messages per second
at 1Mbaud when idle. The theoretical maximum bus load (DLC=1) would be around
20000 messages per second, so at least I am able to keep up with the bandwidth
corresponding to a 500kbaud bus at 100% load.
I wasn't refering to CAN handling, more number cunching and memory access.
Post by David Jander
From my PC (with a Peak USB CAN dongle) I can send only 5200 messages per
second (and this is a Core-i7 with 128bit DDR3 memory interface clocked at
800MHz ;-), so if the interfaces from SYS-Tec are significantly better than
that we might have a deal... ;-)
When using a dual channel interface where can0 is connected with can1: If I run 'cangen -L 1 -g 0 -i can1', 'canbusload ***@1000000 -r -t -b -c -e' shows
***@1000000 9643 549790 77144 54% |XXXXXXXXXX..........|
which would mean ~9600 frames/s.
Post by David Jander
Post by Alexander Stein
Post by David Jander
Post by Alexander Stein
embedded device send ~1000 CAN frames/s, each which is an average
busload of 20%, but in burst time, it should be 100%.
You mean the bursts of 250 messages you talked about earlier should
produce a bus load of 100%? I think it is important to get some certainty
about what's really going on on the bus, specially if we see things we
cannot explain. You don't have access to a PC with a CAN interface or an
oscilloscope, do you?
My PC has our USB-CAN-Modules attached (single or dual channel), so what do
you want to see? a candump log? My access to oscilloscopes rather limited
and not reliable.
$ time cangen -g 0 -I 123 -L 1 -D i -p 2 -n 10000 can0
If you tx-queue is smallish compared to the 10000 messages sent, then the
"real" time reported by "time" will correspond approximately to the amount of
time it took to actually transmit 10000 messages.
% time cangen -g 0 -I 123 -L 1 -D i -p 2 -n 10000 can0
cangen -g 0 -I 123 -L 1 -D i -p 2 -n 10000 can0 0,15s user 0,81s system 99% cpu 0,960 tota

This seem to match the output of canbusload.

Best regards,
Alexander
--
Dipl.-Inf. Alexander Stein

SYS TEC electronic GmbH
Am Windrad 2
08468 Heinsdorfergrund
Tel.: 03765 38600-1156
Fax: 03765 38600-4100
Email: ***@systec-electronic.com
Website: www.systec-electronic.com

Managing Director: Dipl.-Phys. Siegmar Schmidt
Commercial registry: Amtsgericht Chemnitz, HRB 28082

--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Loading...