Commit 086e9dc0 authored by James Hogan

metag: Optimised library functions



Add optimised library functions for metag.
Signed-off-by: James Hogan <james.hogan@imgtec.com>
parent f507758c
arch/metag/include/asm/checksum.h

#ifndef _METAG_CHECKSUM_H
#define _METAG_CHECKSUM_H
/*
* computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit)
*
* returns a 32-bit number suitable for feeding into itself
* or csum_tcpudp_magic
*
* this function must be called with even lengths, except
* for the last fragment, which may be odd
*
* it's best to have buff aligned on a 32-bit boundary
*/
extern __wsum csum_partial(const void *buff, int len, __wsum sum);
/*
* the same as csum_partial, but copies from src while it
* checksums
*
* here even more important to align src and dst on a 32-bit (or even
* better 64-bit) boundary
*/
extern __wsum csum_partial_copy(const void *src, void *dst, int len,
__wsum sum);
/*
* the same as csum_partial_copy, but copies from user space.
*
* here even more important to align src and dst on a 32-bit (or even
* better 64-bit) boundary
*/
extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
int len, __wsum sum, int *csum_err);
#define csum_partial_copy_nocheck(src, dst, len, sum) \
csum_partial_copy((src), (dst), (len), (sum))
/*
* Fold a partial checksum
*/
static inline __sum16 csum_fold(__wsum csum)
{
u32 sum = (__force u32)csum;
sum = (sum & 0xffff) + (sum >> 16);
sum = (sum & 0xffff) + (sum >> 16);
return (__force __sum16)~sum;
}
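
A quick standalone check of the fold (editor's sketch, not part of the commit; the sample values are arbitrary):

#include <assert.h>
#include <stdint.h>

static uint16_t fold(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);	/* may leave a carry in bit 16 */
	sum = (sum & 0xffff) + (sum >> 16);	/* fold that carry back in */
	return (uint16_t)~sum;
}

int main(void)
{
	assert(fold(0x1a2b3c4d) == 0xa987);	/* 0x3c4d + 0x1a2b = 0x5678; ~0x5678 = 0xa987 */
	assert(fold(0xffff0001) == 0xfffe);	/* 0x0001 + 0xffff = 0x10000: needs the second fold */
	return 0;
}
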
/*
* This is a version of ip_compute_csum() optimized for IP headers,
* which always checksum on 4 octet boundaries.
*/
extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);
/*
 * computes the checksum of the TCP/UDP pseudo-header as an unfolded
 * 32-bit partial sum; csum_tcpudp_magic() below folds and complements
 * it into the final 16-bit checksum
 */
static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
unsigned short len,
unsigned short proto,
__wsum sum)
{
unsigned long len_proto = (proto + len) << 8;
asm ("ADD %0, %0, %1\n"
"ADDS %0, %0, %2\n"
"ADDCS %0, %0, #1\n"
"ADDS %0, %0, %3\n"
"ADDCS %0, %0, #1\n"
: "=d" (sum)
: "d" (daddr), "d" (saddr), "d" (len_proto),
"0" (sum)
: "cc");
return sum;
}
static inline __sum16
csum_tcpudp_magic(__be32 saddr, __be32 daddr, unsigned short len,
unsigned short proto, __wsum sum)
{
return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
}
/*
* this routine is used for miscellaneous IP-like checksums, mainly
* in icmp.c
*/
extern __sum16 ip_compute_csum(const void *buff, int len);
#endif /* _METAG_CHECKSUM_H */
arch/metag/include/asm/div64.h

#ifndef __ASM_DIV64_H__
#define __ASM_DIV64_H__
#include <asm-generic/div64.h>
extern u64 div_u64(u64 dividend, u64 divisor);
extern s64 div_s64(s64 dividend, s64 divisor);
#define div_u64 div_u64
#define div_s64 div_s64
#endif
arch/metag/include/asm/string.h

#ifndef _METAG_STRING_H_
#define _METAG_STRING_H_
#define __HAVE_ARCH_MEMSET
extern void *memset(void *__s, int __c, size_t __count);
#define __HAVE_ARCH_MEMCPY
void *memcpy(void *__to, __const__ void *__from, size_t __n);
#define __HAVE_ARCH_MEMMOVE
extern void *memmove(void *__dest, __const__ void *__src, size_t __n);
#endif /* _METAG_STRING_H_ */
arch/metag/lib/ashldi3.S

! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit arithmetic shift left routine.
!
.text
.global ___ashldi3
.type ___ashldi3,function
___ashldi3:
MOV D0Re0,D0Ar2
MOV D1Re0,D1Ar1
CMP D1Ar3,#0 ! COUNT == 0
MOVEQ PC,D1RtP ! Yes, return
SUBS D0Ar4,D1Ar3,#32 ! N = COUNT - 32
BGE $L10
!! Shift < 32
NEG D0Ar4,D0Ar4 ! N = - N
LSL D1Re0,D1Re0,D1Ar3 ! HI = HI << COUNT
LSR D0Ar6,D0Re0,D0Ar4 ! TMP= LO >> -(COUNT - 32)
OR D1Re0,D1Re0,D0Ar6 ! HI = HI | TMP
SWAP D0Ar4,D1Ar3
LSL D0Re0,D0Re0,D0Ar4 ! LO = LO << COUNT
MOV PC,D1RtP
$L10:
!! Shift >= 32
LSL D1Re0,D0Re0,D0Ar4 ! HI = LO << N
MOV D0Re0,#0 ! LO = 0
MOV PC,D1RtP
.size ___ashldi3,.-___ashldi3
arch/metag/lib/ashrdi3.S

! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit arithmetic shift right routine.
!
.text
.global ___ashrdi3
.type ___ashrdi3,function
___ashrdi3:
MOV D0Re0,D0Ar2
MOV D1Re0,D1Ar1
CMP D1Ar3,#0 ! COUNT == 0
MOVEQ PC,D1RtP ! Yes, return
MOV D0Ar4,D1Ar3
SUBS D1Ar3,D1Ar3,#32 ! N = COUNT - 32
BGE $L20
!! Shift < 32
NEG D1Ar3,D1Ar3 ! N = - N
LSR D0Re0,D0Re0,D0Ar4 ! LO = LO >> COUNT
LSL D0Ar6,D1Re0,D1Ar3 ! TMP= HI << -(COUNT - 32)
OR D0Re0,D0Re0,D0Ar6 ! LO = LO | TMP
SWAP D1Ar3,D0Ar4
ASR D1Re0,D1Re0,D1Ar3 ! HI = HI >> COUNT
MOV PC,D1RtP
$L20:
!! Shift >= 32
ASR D0Re0,D1Re0,D1Ar3 ! LO = HI >> N
ASR D1Re0,D1Re0,#31 ! HI = HI >> 31
MOV PC,D1RtP
.size ___ashrdi3,.-___ashrdi3
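
Both shift routines split on whether the count crosses the 32-bit word boundary: below 32, bits move between the two halves; at 32 or above, one half crosses entirely and the other is filled with zeros (or with sign bits for the arithmetic right shift). An equivalent C model (editor's sketch, not part of the commit, valid for counts 1..63):

#include <stdint.h>

static uint64_t ashl64(uint32_t lo, uint32_t hi, unsigned int count)
{
	if (count < 32) {
		hi = (hi << count) | (lo >> (32 - count));	/* HI takes LO's top bits */
		lo <<= count;
	} else {
		hi = lo << (count - 32);	/* LO crosses into HI */
		lo = 0;
	}
	return ((uint64_t)hi << 32) | lo;
}

static int64_t ashr64(uint32_t lo, int32_t hi, unsigned int count)
{
	if (count < 32) {
		lo = (lo >> count) | ((uint32_t)hi << (32 - count));	/* LO takes HI's low bits */
		hi >>= count;			/* arithmetic shift keeps the sign */
	} else {
		lo = (uint32_t)(hi >> (count - 32));	/* HI crosses into LO */
		hi >>= 31;			/* HI becomes all sign bits */
	}
	return ((int64_t)hi << 32) | lo;
}
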
arch/metag/lib/checksum.c

/*
*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* IP/TCP/UDP checksumming routines
*
* Authors: Jorge Cwik, <jorge@laser.satlink.net>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
* Tom May, <ftom@netcom.com>
* Andreas Schwab, <schwab@issan.informatik.uni-dortmund.de>
* Lots of code moved from tcp.c and ip.c; see those files
* for more names.
*
* 03/02/96 Jes Sorensen, Andreas Schwab, Roman Hodek:
* Fixed some nasty bugs, causing some horrible crashes.
* A: At some points, the sum (%0) was used as
* length-counter instead of the length counter
* (%1). Thanks to Roman Hodek for pointing this out.
* B: GCC seems to mess up if one uses too many
* data-registers to hold input values and one tries to
* specify d0 and d1 as scratch registers. Letting gcc
* choose these registers itself solves the problem.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
/* Revised by Kenneth Albanowski for m68knommu. Basic problem: unaligned access
kills, so most of the assembly has to go. */
#include <linux/module.h>
#include <net/checksum.h>
#include <asm/byteorder.h>
static inline unsigned short from32to16(unsigned int x)
{
	/* add the two 16-bit halves; the sum fits in 16 bits plus a carry */
	x = (x & 0xffff) + (x >> 16);
	/* fold the carry back in */
	x = (x & 0xffff) + (x >> 16);
	return x;
}
static unsigned int do_csum(const unsigned char *buff, int len)
{
int odd;
unsigned int result = 0;
if (len <= 0)
goto out;
odd = 1 & (unsigned long) buff;
if (odd) {
#ifdef __LITTLE_ENDIAN
result += (*buff << 8);
#else
result = *buff;
#endif
len--;
buff++;
}
if (len >= 2) {
if (2 & (unsigned long) buff) {
result += *(unsigned short *) buff;
len -= 2;
buff += 2;
}
if (len >= 4) {
const unsigned char *end = buff + ((unsigned)len & ~3);
unsigned int carry = 0;
do {
unsigned int w = *(unsigned int *) buff;
buff += 4;
result += carry;
result += w;
carry = (w > result);
} while (buff < end);
result += carry;
result = (result & 0xffff) + (result >> 16);
}
if (len & 2) {
result += *(unsigned short *) buff;
buff += 2;
}
}
if (len & 1)
#ifdef __LITTLE_ENDIAN
result += *buff;
#else
result += (*buff << 8);
#endif
result = from32to16(result);
if (odd)
result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
out:
return result;
}
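
The inner loop above detects carry without branching: after result += w, the unsigned addition has wrapped exactly when the sum ends up smaller than the addend. A standalone illustration (editor's sketch, not part of the commit):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t result = 0xffff0000u;
	uint32_t w = 0x00020000u;
	uint32_t carry;

	result += w;		/* wraps around to 0x00010000 */
	carry = (w > result);	/* overflow occurred iff result < addend */
	assert(result == 0x00010000u && carry == 1);

	result += carry;	/* end-around carry: ones' complement addition */
	assert(result == 0x00010001u);
	return 0;
}
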
__sum16 ip_fast_csum(const void *iph, unsigned int ihl)
{
	return (__force __sum16)~do_csum(iph, ihl * 4);
}
EXPORT_SYMBOL(ip_fast_csum);
/*
* computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit)
*
* returns a 32-bit number suitable for feeding into itself
* or csum_tcpudp_magic
*
* this function must be called with even lengths, except
* for the last fragment, which may be odd
*
* it's best to have buff aligned on a 32-bit boundary
*/
__wsum csum_partial(const void *buff, int len, __wsum wsum)
{
unsigned int sum = (__force unsigned int)wsum;
unsigned int result = do_csum(buff, len);
/* add in old sum, and carry.. */
result += sum;
if (sum > result)
result += 1;
return (__force __wsum)result;
}
EXPORT_SYMBOL(csum_partial);
/*
* this routine is used for miscellaneous IP-like checksums, mainly
* in icmp.c
*/
__sum16 ip_compute_csum(const void *buff, int len)
{
return (__force __sum16)~do_csum(buff, len);
}
EXPORT_SYMBOL(ip_compute_csum);
/*
* copy from user space while checksumming, otherwise like csum_partial
*/
__wsum
csum_partial_copy_from_user(const void __user *src, void *dst, int len,
__wsum sum, int *csum_err)
{
int missing;
missing = __copy_from_user(dst, src, len);
if (missing) {
memset(dst + len - missing, 0, missing);
*csum_err = -EFAULT;
} else
*csum_err = 0;
return csum_partial(dst, len, sum);
}
EXPORT_SYMBOL(csum_partial_copy_from_user);
/*
* copy from kernel memory (no access checks) while checksumming,
* otherwise like csum_partial
*/
__wsum
csum_partial_copy(const void *src, void *dst, int len, __wsum sum)
{
memcpy(dst, src, len);
return csum_partial(dst, len, sum);
}
EXPORT_SYMBOL(csum_partial_copy);
arch/metag/lib/clear_page.S

! Copyright 2007,2008,2009 Imagination Technologies Ltd.
#include <asm/page.h>
.text
.global _clear_page
.type _clear_page,function
!! D1Ar1 - page
_clear_page:
MOV TXRPT,#((PAGE_SIZE / 8) - 1)
MOV D0Re0,#0
MOV D1Re0,#0
$Lclear_page_loop:
SETL [D1Ar1++],D0Re0,D1Re0
BR $Lclear_page_loop
MOV PC,D1RtP
.size _clear_page,.-_clear_page
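
The MOV to TXRPT arms a Meta hardware loop, so the BR repeats the single SETL store PAGE_SIZE/8 times with no explicit counter decrement. Functionally it reduces to the following (editor's sketch, not part of the commit; PAGE_SIZE is hardcoded here for illustration, the real value comes from asm/page.h):

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096		/* illustration only */

static void clear_page_model(void *page)
{
	uint64_t *p = page;	/* the asm assumes 8-byte alignment */
	size_t i;

	for (i = 0; i < PAGE_SIZE / 8; i++)
		p[i] = 0;	/* one 64-bit SETL per iteration */
}
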
arch/metag/lib/cmpdi2.S

! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit signed compare routine.
!
.text
.global ___cmpdi2
.type ___cmpdi2,function
!          low     high
! s64 a   (D0Ar2,  D1Ar1)
! s64 b   (D0Ar4,  D1Ar3)
___cmpdi2:
! start at 1 (equal) and conditionally increment or decrement
MOV D0Re0,#1
! high words differ?
CMP D1Ar1,D1Ar3
BNE $Lhigh_differ
! unsigned compare low words
CMP D0Ar2,D0Ar4
SUBLO D0Re0,D0Re0,#1
ADDHI D0Re0,D0Re0,#1
MOV PC,D1RtP
$Lhigh_differ:
! signed compare high words
SUBLT D0Re0,D0Re0,#1
ADDGT D0Re0,D0Re0,#1
MOV PC,D1RtP
.size ___cmpdi2,.-___cmpdi2
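
___cmpdi2 follows the libgcc convention of returning 0, 1 or 2 for a < b, a == b, a > b, which is why the result starts at 1 and is nudged down or up. An equivalent C model (editor's sketch, not part of the commit):

#include <stdint.h>

static int cmpdi2_model(int64_t a, int64_t b)
{
	int32_t ah = (int32_t)(a >> 32), bh = (int32_t)(b >> 32);
	uint32_t al = (uint32_t)a, bl = (uint32_t)b;

	if (ah != bh)			/* high words: signed compare */
		return ah < bh ? 0 : 2;
	if (al != bl)			/* low words: unsigned compare */
		return al < bl ? 0 : 2;
	return 1;			/* equal */
}
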
arch/metag/lib/copy_page.S

! Copyright 2007,2008 Imagination Technologies Ltd.
#include <asm/page.h>
.text
.global _copy_page
.type _copy_page,function
!! D1Ar1 - to
!! D0Ar2 - from
_copy_page:
MOV D0FrT,#PAGE_SIZE
$Lcopy_page_loop:
GETL D0Re0,D1Re0,[D0Ar2++]
GETL D0Ar6,D1Ar5,[D0Ar2++]
SETL [D1Ar1++],D0Re0,D1Re0
SETL [D1Ar1++],D0Ar6,D1Ar5
SUBS D0FrT,D0FrT,#16
BNZ $Lcopy_page_loop
MOV PC,D1RtP
.size _copy_page,.-_copy_page
arch/metag/lib/delay.c

/*
* Precise Delay Loops for Meta
*
* Copyright (C) 1993 Linus Torvalds
* Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
* Copyright (C) 2007,2009 Imagination Technologies Ltd.
*
*/
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <asm/core_reg.h>
#include <asm/processor.h>
/*
* TXTACTCYC is only 24 bits, so on chips with fast clocks it will wrap
* many times per-second. If it does wrap __delay will return prematurely,
* but this is only likely with large delay values.
*
* We also can't implement read_current_timer() with TXTACTCYC due to
* this wrapping behaviour.
*/
#define rdtimer(t) t = __core_reg_get(TXTACTCYC)
void __delay(unsigned long loops)
{
unsigned long bclock, now;
rdtimer(bclock);
do {
asm("NOP");
rdtimer(now);
} while ((now-bclock) < loops);
}
EXPORT_SYMBOL(__delay);
inline void __const_udelay(unsigned long xloops)
{
u64 loops = (u64)xloops * (u64)loops_per_jiffy * HZ;
__delay(loops >> 32);
}
EXPORT_SYMBOL(__const_udelay);
void __udelay(unsigned long usecs)
{
__const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
}
EXPORT_SYMBOL(__udelay);
void __ndelay(unsigned long nsecs)
{
__const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
}
EXPORT_SYMBOL(__ndelay);
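
The magic multipliers are 32.32 fixed-point reciprocals: __udelay scales microseconds by 2**32/10**6, and __const_udelay then multiplies by loops_per_jiffy * HZ (loops per second) and keeps the top 32 bits as the loop count. Checking the constants (editor's sketch, not part of the commit):

#include <stdio.h>

int main(void)
{
	/* 2**32 / 10**6, rounded up: the __udelay multiplier */
	unsigned long long udelay_mult = ((1ULL << 32) / 1000000) + 1;
	/* 2**32 / 10**9, rounded up: the __ndelay multiplier */
	unsigned long long ndelay_mult = ((1ULL << 32) / 1000000000) + 1;

	printf("0x%llx 0x%llx\n", udelay_mult, ndelay_mult);	/* 0x10c7 0x5 */
	return 0;
}
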
arch/metag/lib/div64.S

! Copyright (C) 2012 Imagination Technologies Ltd.
!
! Signed/unsigned 64-bit division routines.
!
.text
.global _div_u64
.type _div_u64,function
_div_u64:
$L1:
ORS A0.3,D1Ar3,D0Ar4
BNE $L3
$L2:
MOV D0Re0,D0Ar2
MOV D1Re0,D1Ar1
MOV PC,D1RtP
$L3:
CMP D1Ar3,D1Ar1
CMPEQ D0Ar4,D0Ar2
MOV D0Re0,#1
MOV D1Re0,#0
BHS $L6
$L4:
ADDS D0Ar6,D0Ar4,D0Ar4
ADD D1Ar5,D1Ar3,D1Ar3
ADDCS D1Ar5,D1Ar5,#1
CMP D1Ar5,D1Ar3
CMPEQ D0Ar6,D0Ar4
BLO $L6
$L5:
MOV D0Ar4,D0Ar6
MOV D1Ar3,D1Ar5
ADDS D0Re0,D0Re0,D0Re0
ADD D1Re0,D1Re0,D1Re0
ADDCS D1Re0,D1Re0,#1
CMP D1Ar3,D1Ar1
CMPEQ D0Ar4,D0Ar2
BLO $L4
$L6:
ORS A0.3,D1Re0,D0Re0
MOV D0Ar6,#0
MOV D1Ar5,D0Ar6
BEQ $L10
$L7:
CMP D1Ar1,D1Ar3
CMPEQ D0Ar2,D0Ar4
BLO $L9
$L8:
ADDS D0Ar6,D0Ar6,D0Re0
ADD D1Ar5,D1Ar5,D1Re0
ADDCS D1Ar5,D1Ar5,#1
SUBS D0Ar2,D0Ar2,D0Ar4
SUB D1Ar1,D1Ar1,D1Ar3
SUBCS D1Ar1,D1Ar1,#1
$L9:
LSL A0.3,D1Re0,#31
LSR D0Re0,D0Re0,#1
LSR D1Re0,D1Re0,#1
OR D0Re0,D0Re0,A0.3
LSL A0.3,D1Ar3,#31
LSR D0Ar4,D0Ar4,#1
LSR D1Ar3,D1Ar3,#1
OR D0Ar4,D0Ar4,A0.3
ORS A0.3,D1Re0,D0Re0
BNE $L7
$L10:
MOV D0Re0,D0Ar6
MOV D1Re0,D1Ar5
MOV PC,D1RtP
.size _div_u64,.-_div_u64
.text
.global _div_s64
.type _div_s64,function
_div_s64:
MSETL [A0StP],D0FrT,D0.5
XOR D0.5,D0Ar2,D0Ar4
XOR D1.5,D1Ar1,D1Ar3
TSTT D1Ar1,#HI(0x80000000)
BZ $L25
NEGS D0Ar2,D0Ar2
NEG D1Ar1,D1Ar1
SUBCS D1Ar1,D1Ar1,#1
$L25:
TSTT D1Ar3,#HI(0x80000000)
BZ $L27
NEGS D0Ar4,D0Ar4
NEG D1Ar3,D1Ar3
SUBCS D1Ar3,D1Ar3,#1
$L27:
CALLR D1RtP,_div_u64
TSTT D1.5,#HI(0x80000000)
BZ $L29
NEGS D0Re0,D0Re0
NEG D1Re0,D1Re0
SUBCS D1Re0,D1Re0,#1
$L29:
GETL D0FrT,D1RtP,[A0StP+#(-16)]
GETL D0.5,D1.5,[A0StP+#(-8)]
SUB A0StP,A0StP,#16
MOV PC,D1RtP
.size _div_s64,.-_div_s64
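
_div_u64 is classic shift-and-subtract division: double the divisor and a quotient bit until the divisor covers the dividend (or doubling would overflow), then walk back down, subtracting wherever it fits. _div_s64 strips the signs, reuses _div_u64, and negates the quotient when exactly one operand was negative (the XOR kept in D0.5/D1.5). An editor's C model of both (a sketch, not the commit's code):

#include <stdint.h>

static uint64_t udiv64_model(uint64_t n, uint64_t d)
{
	uint64_t bit = 1, quot = 0;

	if (d == 0)
		return n;	/* mirrors the asm's divide-by-zero path */

	/* scale divisor and bit up until d >= n, stopping before overflow */
	while (d < n) {
		uint64_t d2 = d + d;

		if (d2 < d)	/* doubling overflowed: stop scaling */
			break;
		d = d2;
		bit += bit;
	}

	/* subtract the scaled divisor at every position where it fits */
	while (bit != 0) {
		if (n >= d) {
			quot += bit;
			n -= d;
		}
		bit >>= 1;
		d >>= 1;
	}
	return quot;		/* the remainder left in n is discarded */
}

static int64_t sdiv64_model(int64_t n, int64_t d)
{
	int neg = (n < 0) ^ (d < 0);	/* quotient sign */
	uint64_t un = n < 0 ? 0 - (uint64_t)n : (uint64_t)n;
	uint64_t ud = d < 0 ? 0 - (uint64_t)d : (uint64_t)d;
	uint64_t q = udiv64_model(un, ud);

	return neg ? -(int64_t)q : (int64_t)q;
}
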
arch/metag/lib/divsi3.S

! Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007
! Imagination Technologies Ltd
!
! Integer divide routines.
!
.text
.global ___udivsi3
.type ___udivsi3,function
.align 2
___udivsi3:
!!
!! Since core is signed divide case, just set control variable