Re: problems with memory allocation and the alignment check

From: Michael J. Baars
Date: Wed Feb 24 2021 - 02:07:44 EST


On Mon, 2021-02-22 at 01:41 -0800, Andrew Pinski wrote:
> On Mon, Feb 22, 2021 at 1:37 AM Michael J. Baars
> <mjbaars1977.gcc@xxxxxxxxxxxxx> wrote:
> > On Mon, 2021-02-22 at 01:29 -0800, Andrew Pinski wrote:
> > > On Mon, Feb 22, 2021 at 1:17 AM Michael J. Baars
> > > <mjbaars1977.gcc@xxxxxxxxxxxxx> wrote:
> > > > Hi,
> > > >
> > > > I just wrote this little program to demonstrate a possible flaw in both malloc and calloc.
> > > >
> > > > If I allocate the simplest memory region from main(), one out of three optimization flags fails.
> > > > If I allocate the same region from a function, three out of three optimization flags fail.
> > > >
> > > > Does someone know if this really is a flaw, and if so, is it a gcc or a kernel flaw?
> > >
> > > There is no flaw. GCC (kernel, glibc) all assume unaligned accesses
> > > on x86 will not cause an exception.
> >
> > Is this just an assumption or more like a fact? I agree with you that byte aligned is more or less the same as unaligned.
>
> It is an assumption that is even made inside GCC. You can modify GCC
> not to assume that but you need to recompile all libraries and even
> check the assembly code that is included with most programs.
> Why are you enabling the alignment access check anyways? What are you
> trying to do?
> If you are looking into a performance issue with unaligned accesses,
> may I suggest you look into perf to see if you can see unaligned
> accesses?

Next to performance and correctness, I always try to keep in mind that every clock cycle will eventually end up on the energy bill, so that computers do not end up costing
ten times more on the energy bill than they do in the store.

If you look at the power consumption of the Playstation 1 vs that of the Playstation 3 for example, you will see that the Playstation 1 uses 10 W max (10 W / 240 V
= 0.041666667 A), while the Playstation 3 consumes 240 V * 1.7 A = 408 W. More than 40 times as much power!!!

Code and style always go hand in hand. Try to keep your code as sleek as possible and you will see that even an old computer can do a lot more than you ever
thought possible :)

Thanks,
Mischa.

> Thanks,
> Andrew
>
> > > Thanks,
> > > Andrew
> > >
> > > > Regards,
> > > > Mischa.
#include <stdint.h>

#include "compression.h"

/*
 * Identity byte table: data_s[i] == i for every i in 0..255.
 * Laid out 16 values per row, so the row index is the high nibble and the
 * column index is the low nibble of each entry.
 */
uint8_t data_s[256] = {
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
	0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
	0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
	0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
	0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
	0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
	0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
};
/*
0000000000000000 <compression_encode_prepare1>:
0: 48 89 f9 mov %rdi,%rcx
3: 31 d2 xor %edx,%edx
5: b8 00 00 00 01 mov $0x1000000,%eax
a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
10: 48 83 e8 01 sub $0x1,%rax
14: 75 fa jne 10 <compression_encode_prepare1+0x10>
16: 88 11 mov %dl,(%rcx)
18: 48 83 c2 01 add $0x1,%rdx
1c: 48 83 c1 01 add $0x1,%rcx
20: 48 81 fa 00 01 00 00 cmp $0x100,%rdx
27: 75 dc jne 5 <compression_encode_prepare1+0x5>
29: c3 retq
2a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
*/

/*
 * Plain-C table set-up: writes the identity mapping data_t[i] = i into all
 * 256 entries of c->data_t.
 *
 * The complete fill is repeated 2^24 times; every pass stores the same
 * values, so the final contents equal those of a single pass.
 *
 * c: destination structure; must not be NULL (it is dereferenced
 *    unconditionally).
 */
void compression_encode_prepare1 (struct compression* c)
{
	const uint64_t passes = (1 << 24);

	for (uint64_t pass = 0; pass < passes; pass++)
	{
		uint8_t* out = c->data_t;

		/* The 64-bit index is narrowed to uint8_t on store; it never exceeds 255. */
		for (uint64_t idx = 0; idx < 256; idx++)
			out[idx] = idx;
	}
}

void compression_encode_prepare2 (struct compression* c)
{
for (uint64_t j = 0; j < (1 << 24); j++)
asm volatile \
( \
" lea %0 , %%rdi \n" \
" lea %1 , %%rsi \n" \
" mov $0x20, %%rcx \n" \
" rep movsq \n" \
: "=m" (c->data_t) \
: "m" ( data_s) \
: "%rcx", "%rsi", "%rdi" \
);
}

/*
 * Include guard.  Identifiers containing a double underscore are reserved
 * for the implementation, so the guard uses a plain name.
 */
#ifndef COMPRESSION_H
#define COMPRESSION_H

#include <stdint.h>

/* Working state filled in by the compression_encode_prepare* routines. */
struct compression
{
	uint8_t data_t[256]; // compression tree indices
};

/*
 * Both routines overwrite all 256 bytes of c->data_t.
 * c must point to a valid struct compression.
 */
extern void compression_encode_prepare1 (struct compression* c);
extern void compression_encode_prepare2 (struct compression* c);

#endif /* COMPRESSION_H */

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#include "compression.h"

/* Print 256 bytes as two-digit upper-case hex, each followed by a space, then a newline. */
static void dump_table (const uint8_t* table)
{
	for (uint64_t k = 0; k < 256; k++)
	{
		printf("%02hhX ", table[k]);
	}

	printf("\n");
}

/* Print the processor time between two clock() samples, in seconds. */
static void report_elapsed (clock_t start, clock_t stop)
{
	printf("elapsed compression & encryption: %fs\n", (double) (stop - start) / (double) CLOCKS_PER_SEC);
}

/*
 * Benchmark driver: runs compression_encode_prepare1 and then
 * compression_encode_prepare2 on the same structure.  After each run it
 * dumps the resulting table and prints the processor time the call took,
 * measured with clock().
 */
int main()
{
	struct compression c;
	clock_t tic, toc;

	/* First variant. */
	tic = clock();
	compression_encode_prepare1 (&c);
	toc = clock();

	dump_table (c.data_t);
	report_elapsed (tic, toc);

	/* Second variant, writing into the same structure. */
	tic = clock();
	compression_encode_prepare2 (&c);
	toc = clock();

	dump_table (c.data_t);
	report_elapsed (tic, toc);
}
# Builds the `main` executable from main.c and compression.c.
# `all` names no file, so it is declared phony; otherwise a stray file called
# "all" in this directory would make the target look up to date and nothing
# would be built.  Recipe lines must begin with a tab character.
.PHONY: all

all:
	gcc -Ofast -c -g -o compression.o compression.c
	gcc -Ofast -c -g -o main.o main.c
	gcc -Ofast -g -o main main.o compression.o