QB64.org Forum

Active Forums => QB64 Discussion => Topic started by: SMcNeill on November 05, 2021, 09:10:54 am

Title: Somebody explain this one to me...
Post by: SMcNeill on November 05, 2021, 09:10:54 am
Code: QB64: [Select]
  1. Screen _NewImage(1024, 720, 32)
  2.  
  3.  
  4. TestLimit = 100
  5. Radius = 350
  6.  
  7. t# = Timer
  8. For i = 1 To TestLimit
  9.     'Cls , &HFF000000 + Int(Rnd * &HFFFFFF)
  10.     k& = &HFF000000 + Int(Rnd * &HFFFFFF)
  11.  
  12.     Circle (512, 360), Radius, k&
  13.     Fill 512, 360, k&, 0
  14.  
  15. t1# = Timer
  16. For i = 1 To TestLimit
  17.     'Cls , &HFF000000 + Int(Rnd * &HFFFFFF)
  18.     k& = &HFF000000 + Int(Rnd * &HFFFFFF)
  19.  
  20.     Circle (512, 360), Radius, k&
  21.     Fill 512, 360, k&, -1
  22. t2# = Timer
  23.  
  24. Print Using "###.#### seconds for Fill."; t1# - t#
  25. Print Using "###.#### seconds for Fill Method -1."; t2# - t1#
  26.  
  27.  
  28.  
  29.  
  30.  
  31. Sub Fill (Passx, Passy, kolor As Long, METHOD)
  32.     B = _Blend
  33.  
  34.     Dim TempGrid(0 To _Width - 1, 0 To _Height - 1) As Long
  35.     Dim As _MEM m, m1: m = _Mem(TempGrid()): m1 = _MemImage(0)
  36.     Dim o As _Offset
  37.     If METHOD Then
  38.         _MemCopy m1, m1.OFFSET, m1.SIZE To m, m.OFFSET
  39.     End If
  40.     o = 0
  41.     Do Until o >= m.SIZE
  42.         If _MemGet(m1, m1.OFFSET + o, _Unsigned Long) = kolor Then
  43.             _MemPut m, m.OFFSET + o, -1 As LONG
  44.             'Else
  45.             '_MemPut m, m.OFFSET + o, 0 As LONG
  46.         End If
  47.         o = o + 4
  48.     Loop
  49.  
  50.     TempGrid(Passx, Passy) = 1
  51.     startx = Passx: finishx = Passx
  52.     starty = Passy: finishy = Passy
  53.  
  54.     Do Until finished
  55.         pass = pass + 1
  56.         finished = -1
  57.         For x = startx To finishx
  58.             For y = starty To finishy
  59.                 'If TempGrid(x, y) <> 0 Then Print x, y, TempGrid(x, y): Sleep
  60.  
  61.                 If TempGrid(x, y) = pass Then
  62.                     tempx = x
  63.                     Do Until tempx = 0
  64.                         If TempGrid(tempx - 1, y) = 0 Then
  65.                             TempGrid(tempx - 1, y) = pass + 1
  66.                             tempx = tempx - 1
  67.                             If tempx < startx Then startx = tempx
  68.                             finished = 0
  69.                         Else
  70.                             tempx = 0
  71.                         End If
  72.                     Loop
  73.  
  74.                     tempx = x
  75.                     Do Until tempx = _Width - 1
  76.                         If TempGrid(tempx + 1, y) = 0 Then
  77.                             TempGrid(tempx + 1, y) = pass + 1
  78.                             tempx = tempx + 1
  79.                             If tempx > finishx Then finishx = tempx
  80.                             finished = 0
  81.                         Else
  82.                             tempx = _Width - 1
  83.                         End If
  84.                     Loop
  85.  
  86.                     tempy = y
  87.                     Do Until tempy = 0
  88.                         If TempGrid(x, tempy - 1) = 0 Then
  89.                             TempGrid(x, tempy - 1) = pass + 1
  90.                             tempy = tempy - 1
  91.                             If tempy < starty Then startx = tempy
  92.                             finished = 0
  93.                         Else
  94.                             tempy = 0
  95.                         End If
  96.                     Loop
  97.  
  98.                     tempy = y
  99.                     Do Until tempy = _Height - 1
  100.                         If TempGrid(x, tempy + 1) = 0 Then
  101.                             TempGrid(x, tempy + 1) = pass + 1
  102.                             tempy = tempy + 1
  103.                             If tempy > finishy Then finishy = tempy
  104.                             finished = 0
  105.                         Else
  106.                             tempy = _Height - 1
  107.                         End If
  108.                     Loop
  109.                 End If
  110.         Next y, x
  111.     Loop
  112.  
  113.     o = 0
  114.     Do Until o >= m.SIZE
  115.         If _MemGet(m, m.OFFSET + o, _Unsigned Long) <> 0 Then _MemPut m1, m1.OFFSET + o, kolor As _UNSIGNED LONG
  116.         o = o + 4
  117.     Loop
  118.     _MemFree m: _MemFree m1
  119.     If B Then _Blend

  [ This attachment cannot be displayed inline in 'Print Page' view ]  

Now, if you guys look, there's only one difference in the code:

    If METHOD Then
        _MemCopy m1, m1.OFFSET, m1.SIZE To m, m.OFFSET
    End If

The main process here works like this:

First, we make an array the size of the screen.
Then where the color exists that we want to use as a paint boundary, we assign a value of -1 to the grid in that position.
Then we do a little pathfinding style routine counting passes to fill in from the point we choose to the end of those boundaries...

Both routines work flawlessly.  Neither has any issues doing what they're supposed to do. 

In one (where we pass a method value of 0), the array has starts out empty with values of in it, and we assign a value of -1 to any point where the chosen color is... 

In the other (where we pass a method value of anything but 0), we copy the whole array of screen colors over, and assign a value of -1 to any point where the chosen color is..  (Note, this can only work as long as our screen doesn't have any bright white characters printed on it, as it doesn't have in these instances here.)

Now, logically speaking, it would seem that one method would, indeed, be faster than the other.  Wouldn't it be faster to go with a blank array, rather than first having to copy and alter every pixel from the screen onto the array?  You'd think so...

...and, as the screenshot above illustrates, YOU'D BE WRONG!!

It's faster to copy the whole screen into our array -- a LOT faster -- than it is to NOT copy the screen.  WTH?!!  WHY??

Why is it faster to do MORE work than it is to SKIP that work???

I find the differences in the speed runs here to be mind boggling, and I can't sort out why the heck it's faster to do more work than it is to NOT do it.

Anyone want to take a shot at explaining why this has such different speed results with it?  I'm completely baffled here.
Title: Re: Somebody explain this one to me...
Post by: Petr on November 05, 2021, 09:48:40 am
It does not make any sense. The only thing I can think of, but I'm really guessing, is the organization of bytes in memory and in the field. Is it the same or is it the other way around?
Title: Re: Somebody explain this one to me...
Post by: SMcNeill on November 05, 2021, 09:57:33 am
It does not make any sense. The only thing I can think of, but I'm really guessing, is the organization of bytes in memory and in the field. Is it the same or is it the other way around?

It doesn't make any sense to me either. 

The first has a blank array the size of our screen width and height.

The second is that exact same array, with the screen values copied over.

Then we set the boundary color to -1 and process that array in the exact same manner for either of those two methods....

The second is doing more work than the first, adding in an extra, unneeded step -- but it's finishing up 10x faster than the original!

It makes no sense to me.  I'm utterly baffled by these results.
Title: Re: Somebody explain this one to me...
Post by: Petr on November 05, 2021, 10:16:38 am
Wait. First run is WITH METHOD = 0 so MEMCOPY on row 46 is not used and duration (here) is 15 seconds. Second run use MEMCOPY and duration is 0.6 second here. If is MEMCOPY commented, then both run durations are 15 seconds on my computer.


Code: QB64: [Select]
  1. Screen _NewImage(1024, 720, 32)
  2.  
  3.  
  4. TestLimit = 100
  5. Radius = 350
  6.  
  7. t# = Timer
  8. For i = 1 To TestLimit
  9.     'Cls , &HFF000000 + Int(Rnd * &HFFFFFF)
  10.     k& = &HFF000000 + Int(Rnd * &HFFFFFF)
  11.  
  12.     Circle (512, 360), Radius, k&
  13.     Fill 512, 360, k&, 0
  14.  
  15. t1# = Timer
  16. For i = 1 To TestLimit
  17.     'Cls , &HFF000000 + Int(Rnd * &HFFFFFF)
  18.     k& = &HFF000000 + Int(Rnd * &HFFFFFF)
  19.  
  20.     Circle (512, 360), Radius, k&
  21.     Fill 512, 360, k&, -1
  22. t2# = Timer
  23.  
  24. Print Using "###.#### seconds for Fill."; t1# - t#
  25. Print Using "###.#### seconds for Fill Method -1."; t2# - t1#
  26.  
  27.  
  28.  
  29.  
  30.  
  31. Sub Fill (Passx, Passy, kolor As _Unsigned Long, METHOD)
  32.     B = _Blend
  33.     '_DontBlend
  34.  
  35.     ReDim TempGrid(0 To _Width - 1, 0 To _Height - 1) As _Unsigned Long
  36.     Dim As _MEM m, m1: m = _Mem(TempGrid()): m1 = _MemImage(0)
  37.     Dim o As _Offset
  38.  
  39.     If METHOD Then 'this is second run
  40.         _MemCopy m1, m1.OFFSET, m1.SIZE To m, m.OFFSET
  41.     End If
  42.     o = 0
  43.  
  44.     If METHOD = 0 Then 'this is first run
  45.         Do Until o >= m.SIZE
  46.             If _MemGet(m1, m1.OFFSET + o, _Unsigned Long) = kolor Then
  47.                 _MemPut m, m.OFFSET + o, -1 As _UNSIGNED LONG
  48.                 'Else
  49.                 '_MemPut m, m.OFFSET + o, 0 As LONG
  50.             End If
  51.             o = o + 4
  52.         Loop
  53.     End If
  54.  
  55.     TempGrid(Passx, Passy) = 1
  56.     startx = Passx: finishx = Passx
  57.     starty = Passy: finishy = Passy
  58.  
  59.     Do Until finished
  60.         pass = pass + 1
  61.         finished = -1
  62.         For x = startx To finishx
  63.             For y = starty To finishy
  64.                 'If TempGrid(x, y) <> 0 Then Print x, y, TempGrid(x, y): Sleep
  65.  
  66.                 If TempGrid(x, y) = pass Then
  67.                     tempx = x
  68.                     Do Until tempx = 0
  69.                         If TempGrid(tempx - 1, y) = 0 Then
  70.                             TempGrid(tempx - 1, y) = pass + 1
  71.                             tempx = tempx - 1
  72.                             If tempx < startx Then startx = tempx
  73.                             finished = 0
  74.                         Else
  75.                             tempx = 0
  76.                         End If
  77.                     Loop
  78.  
  79.                     tempx = x
  80.                     Do Until tempx = _Width - 1
  81.                         If TempGrid(tempx + 1, y) = 0 Then
  82.                             TempGrid(tempx + 1, y) = pass + 1
  83.                             tempx = tempx + 1
  84.                             If tempx > finishx Then finishx = tempx
  85.                             finished = 0
  86.                         Else
  87.                             tempx = _Width - 1
  88.                         End If
  89.                     Loop
  90.  
  91.                     tempy = y
  92.                     Do Until tempy = 0
  93.                         If TempGrid(x, tempy - 1) = 0 Then
  94.                             TempGrid(x, tempy - 1) = pass + 1
  95.                             tempy = tempy - 1
  96.                             If tempy < starty Then startx = tempy
  97.                             finished = 0
  98.                         Else
  99.                             tempy = 0
  100.                         End If
  101.                     Loop
  102.  
  103.                     tempy = y
  104.                     Do Until tempy = _Height - 1
  105.                         If TempGrid(x, tempy + 1) = 0 Then
  106.                             TempGrid(x, tempy + 1) = pass + 1
  107.                             tempy = tempy + 1
  108.                             If tempy > finishy Then finishy = tempy
  109.                             finished = 0
  110.                         Else
  111.                             tempy = _Height - 1
  112.                         End If
  113.                     Loop
  114.                 End If
  115.         Next y, x
  116.     Loop
  117.  
  118.     o = 0
  119.     Do Until o >= m.SIZE
  120.         If _MemGet(m, m.OFFSET + o, _Unsigned Long) <> 0 Then _MemPut m1, m1.OFFSET + o, kolor As _UNSIGNED LONG
  121.         o = o + 4
  122.     Loop
  123.     _MemFree m: _MemFree m1
  124.     Erase TempGrid
  125.     If B Then _Blend
  126.  
  127.  

So - it is ok, or not?
Title: Re: Somebody explain this one to me...
Post by: SMcNeill on November 05, 2021, 10:36:09 am
That's the question: Why is the _MEMCOPY speeding it up so much?  It's a completely unnecessary step and is just extra processing being done...  and yet it's sooo much faster than not using it??

How can we do more work and yet end up with faster times?  Much faster times to boot?

Note -- unlike your change below Petr, the original is running that process in both runs:
 
    If METHOD = 0 Then 'this is first run
        Do Until o >= m.SIZE
            If _MemGet(m1, m1.OFFSET + o, _Unsigned Long) = kolor Then
                _MemPut m, m.OFFSET + o, -1 As _UNSIGNED LONG
                'Else
                '_MemPut m, m.OFFSET + o, 0 As LONG
            End If
            o = o + 4
        Loop
    End If


So how is it running it faster with _MEMCOPY in play?
Title: Re: Somebody explain this one to me...
Post by: Cobalt on November 05, 2021, 10:44:52 am
something similar to how LINE()-(),,BF is faster than just plain LINE()-(), ?

by the way _SCREENMOVE _MIDDLE  is not working on my machine, unless its supposed to put the window about half off the screen.
Title: Re: Somebody explain this one to me...
Post by: SMcNeill on November 05, 2021, 10:57:14 am
something similar to how LINE()-(),,BF is faster than just plain LINE()-(), ?

by the way _SCREENMOVE _MIDDLE  is not working on my machine, unless its supposed to put the window about half off the screen.

Add a huge ass delay in front of it.  0.25 used to work for me, but now I have to pump it up to about 2 seconds before it works right.

Or use this: https://www.qb64.org/forum/index.php?topic=3659.msg130063#msg130063
Title: Re: Somebody explain this one to me...
Post by: Petr on November 05, 2021, 11:00:03 am
Probably because MEMCOPY does not compare anything, it just takes a block of memory and copies it elsewhere, while the second method compares all the values with each other, which is unnecessary because the next part of the program selects it anyway. There is no better explanation. Probably too many comparisons and the length of the loop slows it down, in the slower method it is 786,432 comparisons 40 times in a row, so that's 31,457,280 comparisons in 15 seconds, that means 2,097,152 comparisons of values per second, it seems to me really enough.
Title: Re: Somebody explain this one to me...
Post by: Cobalt on November 05, 2021, 11:04:44 am
Add a huge ass delay in front of it.  0.25 used to work for me, but now I have to pump it up to about 2 seconds before it works right.

Or use this: https://www.qb64.org/forum/index.php?topic=3659.msg130063#msg130063

oh so _MIDDLE still needs a delay? _SCREENMOVE 5,5 works fine without, so I thought all the _SCREENMOVEs were fixed.

anyway,

Why is the second run always faster than the first? if you rem out the IF THEN so the MEMCOPY always runs the second time is always faster than the first by a fair amount. What would cause that as well?
Title: Re: Somebody explain this one to me...
Post by: SMcNeill on November 05, 2021, 11:33:12 am
Why is the second run always faster than the first? if you rem out the IF THEN so the MEMCOPY always runs the second time is always faster than the first by a fair amount. What would cause that as well?

That's the question I've been having, and why I posted this topic.  :P

I'm beginning to think that QB64 may not be reserving memory when we DIM an array, but is instead using memory as we place information into an array.

Try this as an example:

Code: QB64: [Select]
  1. Dim a(1000000000) As String * 1
  2. Print "1 GB of memory used in main program"
  3. Pause
  4. foo
  5. foo2
  6.  
  7.  
  8. Sub foo
  9.     Dim a(1000000000) As String * 1
  10.     Print "1 GB of memory used in SUB foo"
  11.     Pause
  12.  
  13. Sub foo2
  14.     Dim a(1000000000) As String * 1
  15.     Dim m As _MEM: m = _Mem(a())
  16.     _MemFill m, m.OFFSET, m.SIZE, 65 As _UNSIGNED _BYTE
  17.     Print a(1000000000)
  18.     Print "1 GB of memory used in SUB foo2"
  19.     Pause
  20.  
  21.  
  22.  
  23.  
  24. Sub Pause
  25.     Print "Press <SPACE> to continue.  Check your mem usage in Task Manager as you go."
  26.     _KeyClear
  27.     Do
  28.         _Limit 10
  29.     Loop Until _KeyHit = 32

Now, the above expects your machine to have 2 GB of available ram to run it.  If you don't, or if you're compiling on a 32-bit version of QB64, drop a zero from each of those arrays...

When you first run it, you use 1GB of memory for your program.
Hit space, and the second array inside the SUB *should* use another GB of memory.   **IT DOESN'T.**
Hit space, and the third array inside the last SUB dims the array AND puts something in it...  It uses an extra GB of memory.

Arrays dimmed in our main program seem to reserve memory for their usage.  Arrays dimmed in SUB/FUNCTIONs don't seem to do this -- they allocate memory as the array gets used and filled.

For my program, I think the difference in speed is coming from this allocation of "memory on the fly".  With the _MEMCOPY, we fill the array all at once, without it, the routine is filling the array a single element at a time and is constantly resizing and moving itself in memory as needed...

_MEMCOPY might be an overall unnecessary step, but just the fact that it's allocating the space and holding it in memory all at once seems to be what's making the noticeable difference in speed here.
Title: Re: Somebody explain this one to me...
Post by: SMcNeill on November 05, 2021, 11:54:15 am
That's *exactly* what's happening here!

Code: QB64: [Select]
  1. Dim a(1000000000) As String * 1
  2. Print "1 GB of memory used in main program"
  3. Pause
  4. foo
  5.  
  6.  
  7. Sub foo
  8.     Dim a(1000000000) As String * 1
  9.     Dim m As _MEM: m = _Mem(a())
  10.     Print "1 GB of memory used in SUB foo"
  11.     Pause
  12.     For i = 1 To 1000000000 Step 100000000
  13.         _MemFill m, m.OFFSET, i, 65 As _UNSIGNED _BYTE
  14.         Print "Inside the SUB, with "; i; " in use."
  15.         Pause
  16.     Next
  17.  
  18. Sub Pause
  19.     Print "Press <SPACE> to continue.  Check your mem usage in Task Manager as you go."
  20.     _KeyClear
  21.     Do
  22.         _Limit 10
  23.     Loop Until _KeyHit = 32

Somehow, arrays are allocated differently in memory if they're inside a sub or in the main module.  Arrays used in the main module reserve space for themselves in memory as we DIM them, whereas arrays inside a SUB are constantly resizing and growing to fit how much information we've actually put in them. 

The _MEMCOPY in this case is filling the array completely all in one go, while without it, the array is constantly regrowing, resizing, and repositioning itself in memory. 

Who knew?!!

When I need speed from now on, I may need to create a static dummy array for a program and then simply _MEMCOPY it over all at once to fix this type problem inside a SUB/FUNCTION.  :P
Title: Re: Somebody explain this one to me...
Post by: SMcNeill on November 05, 2021, 12:34:41 pm
Here's another quick test for you guys:

    Dim TempGrid(0 To _Width - 1, 0 To _Height - 1) As Long

Move that DIM statement from the SUB and place it at the top of your code as a DIM SHARED statement, right after the SCREEN statement.  See the difference in the timed results then!

Title: Re: Somebody explain this one to me...
Post by: bplus on November 05, 2021, 01:08:24 pm
I changed a few things and Method -1 still fast but doesn't look right:
Code: QB64: [Select]
  1. Screen _NewImage(1024, 700, 32)
  2. _ScreenMove 150, 40
  3.  
  4. TestLimit = 1
  5.  
  6. t# = Timer
  7. For i = 1 To TestLimit
  8.     For Radius = 340 To 0 Step -1
  9.         k& = &HFF000000 + Int(Rnd * &HFFFFFF)
  10.         Circle (512, 350), Radius, k&
  11.         Fill 512, 350, k&, -1
  12.     Next
  13.  
  14. t1# = Timer
  15. For i = 1 To TestLimit
  16.     For Radius = 340 To 0 Step -1
  17.         k& = &HFF000000 + Int(Rnd * &HFFFFFF)
  18.         Circle (512, 350), Radius, k&
  19.         Fill 512, 350, k&, 0
  20.     Next
  21. t2# = Timer
  22. Print Using "###.#### seconds for Fill."; t2# - t1#
  23. Print Using "###.#### seconds for Fill Method -1."; t1# - t#
  24.  
  25.  
  26.  
  27.  
  28.  
  29. Sub Fill (Passx, Passy, kolor As Long, METHOD)
  30.     B = _Blend
  31.  
  32.     Dim TempGrid(0 To _Width - 1, 0 To _Height - 1) As Long
  33.     Dim As _MEM m, m1: m = _Mem(TempGrid()): m1 = _MemImage(0)
  34.     Dim o As _Offset
  35.     If METHOD Then
  36.         _MemCopy m1, m1.OFFSET, m1.SIZE To m, m.OFFSET
  37.     End If
  38.     o = 0
  39.     Do Until o >= m.SIZE
  40.         If _MemGet(m1, m1.OFFSET + o, _Unsigned Long) = kolor Then
  41.             _MemPut m, m.OFFSET + o, -1 As LONG
  42.             'Else
  43.             '_MemPut m, m.OFFSET + o, 0 As LONG
  44.         End If
  45.         o = o + 4
  46.     Loop
  47.  
  48.     TempGrid(Passx, Passy) = 1
  49.     startx = Passx: finishx = Passx
  50.     starty = Passy: finishy = Passy
  51.  
  52.     Do Until finished
  53.         pass = pass + 1
  54.         finished = -1
  55.         For x = startx To finishx
  56.             For y = starty To finishy
  57.                 'If TempGrid(x, y) <> 0 Then Print x, y, TempGrid(x, y): Sleep
  58.  
  59.                 If TempGrid(x, y) = pass Then
  60.                     tempx = x
  61.                     Do Until tempx = 0
  62.                         If TempGrid(tempx - 1, y) = 0 Then
  63.                             TempGrid(tempx - 1, y) = pass + 1
  64.                             tempx = tempx - 1
  65.                             If tempx < startx Then startx = tempx
  66.                             finished = 0
  67.                         Else
  68.                             tempx = 0
  69.                         End If
  70.                     Loop
  71.  
  72.                     tempx = x
  73.                     Do Until tempx = _Width - 1
  74.                         If TempGrid(tempx + 1, y) = 0 Then
  75.                             TempGrid(tempx + 1, y) = pass + 1
  76.                             tempx = tempx + 1
  77.                             If tempx > finishx Then finishx = tempx
  78.                             finished = 0
  79.                         Else
  80.                             tempx = _Width - 1
  81.                         End If
  82.                     Loop
  83.  
  84.                     tempy = y
  85.                     Do Until tempy = 0
  86.                         If TempGrid(x, tempy - 1) = 0 Then
  87.                             TempGrid(x, tempy - 1) = pass + 1
  88.                             tempy = tempy - 1
  89.                             If tempy < starty Then startx = tempy
  90.                             finished = 0
  91.                         Else
  92.                             tempy = 0
  93.                         End If
  94.                     Loop
  95.  
  96.                     tempy = y
  97.                     Do Until tempy = _Height - 1
  98.                         If TempGrid(x, tempy + 1) = 0 Then
  99.                             TempGrid(x, tempy + 1) = pass + 1
  100.                             tempy = tempy + 1
  101.                             If tempy > finishy Then finishy = tempy
  102.                             finished = 0
  103.                         Else
  104.                             tempy = _Height - 1
  105.                         End If
  106.                     Loop
  107.                 End If
  108.         Next y, x
  109.     Loop
  110.  
  111.     o = 0
  112.     Do Until o >= m.SIZE
  113.         If _MemGet(m, m.OFFSET + o, _Unsigned Long) <> 0 Then _MemPut m1, m1.OFFSET + o, kolor As _UNSIGNED LONG
  114.         o = o + 4
  115.     Loop
  116.     _MemFree m: _MemFree m1
  117.     If B Then _Blend
  118.  
  119.  
Title: Re: Somebody explain this one to me...
Post by: Cobalt on November 05, 2021, 01:44:12 pm
That's the question I've been having, and why I posted this topic.  :P

I'm beginning to think that QB64 may not be reserving memory when we DIM an array, but is instead using memory as we place information into an array.

no I mean if you make the 2 loops the same (method -1 on both) so they both use the _MEMCOPY, the second timed loop is still faster than the first by .15 to .25 seconds faster.
Title: Re: Somebody explain this one to me...
Post by: Petr on November 06, 2021, 04:41:54 am
Quote
Here's another quick test for you guys:

    Dim TempGrid(0 To _Width - 1, 0 To _Height - 1) As Long

Move that DIM statement from the SUB and place it at the top of your code as a DIM SHARED statement, right after the SCREEN statement.  See the difference in the timed results then!

@SMcNeill

Hi, here is my results after replacing DIM
0.7 seconds (before 15 seconds)
0.6 seconds (the same as before)

Let me express my deep respect for you, because I would not really look for the problem in this. This is an absolutely crucial finding. I am very glad that you have alerted us to this matter. Great job!
Title: Re: Somebody explain this one to me...
Post by: SMcNeill on November 06, 2021, 03:18:09 pm
@SMcNeill

Hi, here is my results after replacing DIM
0.7 seconds (before 15 seconds)
0.6 seconds (the same as before)

Let me express my deep respect for you, because I would not really look for the problem in this. This is an absolutely crucial finding. I am very glad that you have alerted us to this matter. Great job!

The code as written is kind of buggy (it only works with a clear background), but it does a good job illustrating how arrays work in subs/functions.  In the main part of the program, they reserve memory when we DIM them.  In SUB/FUNCTION they don't -- the memory is allocated dynamically as needed.  This difference can produce quite a speed/performance difference, so there may be times when it's better to DIM SHARED an array and let it stay in memory than it is to DIM it and free it repeatedly in a SUB.

It's a shame that STATIC won't work with an array in a sub.  At least, I can't get it to work.  When I try I just get a crash to desktop. 

Just something to keep in the back of the mind from now on for speed optimizations.
Title: Re: Somebody explain this one to me...
Post by: Richard on December 05, 2021, 01:20:09 am
@SMcNeill





I offer the following explanation as to how it is possible that by addition of one extra line of code, significant reduction in timing results rather than expected an increase.


This reply assumes you understand the background information as per the link below



https://www.qb64.org/forum/index.php?topic=2276.msg138538#msg138538 (https://www.qb64.org/forum/index.php?topic=2276.msg138538#msg138538)
Reply 184



Firstly, very roughly when determining the size of your code (i.e. a program containing no code (untitled.exe) subtracted from your program.exe) - your actual running code (.exe) is about 10.2 Kbytes (I get 2094080 -2083840 =~10.2 Kbytes) . If your computer (which runs almost exactly 2x faster than mine) is of similar/related architecture (eg INTEL >= i7) - then the 10K of .exe code is a candidate for L1.

On my laptop, L1 is 32K - but from my observations Windows background processes are using all processors (4 physical, 8 logical) even when CPU useage (Task Manager) is only say 30%. So at any time, a fairly large percentage of L1 is being "hogged" by Windows and you would be lucky to have any where near 10 Kbytes of "your" code in L1 (similar considerations for L2 and L3).


When, for a clearer understanding, I reduce your program (supplied below) to minimal requirements, the active code (.exe) is (2092032 - 2083840) =~ 8.2 Kbytes. To me, although the difference (10.2 - 8.2 Kb) may not seem much, in relative terms for L1 location (which is being "butchered" in useage by Windows background processes) - it is apparent that even a relatively slight change in .exe code can yield wildly differing timing results (because of Windows).


So your.exe (heavily referred part = 10.2 Kbytes) is only partly residing in L1 (similarly for L2 and L3) at any instance of time. When the _MemCopy code is acted on - some (_WIDTH * _HEIGHT  =~ 730,000) memory accesses are efficiently at machine level being processed, which is a very large number, which means L1 (after a setup time) is preloaded with the relevant _MemCopy machine Code. Also probably the _MemCopy is relatively small (say << 1K bytes) and has a high chance of 'displacing' not-so-much utilized Windows background processes that are "lying around still in L1". By virtue of the fact that _MemCopy machine code is "resident" in L1, any future references to m and m1 _MEM BLOCKS  (i.e. the rest of your program) would benefit from MEM code being in L1. In fact, with your test limit value of 100, some 73 million MEM operations (over about 10 seconds) would have speed improvement simply because the fact that _MEMCOPY was utilized once. HAD all of your 10.2 Kbyte of code been resident in L1 then POSSIBLY there may not be any significant between (with _MEMCOPY) versus (without _MEMCOPY). I do not know how to measure what is L1, L2, L3 at any time but as far as L1 goes it seems very apparent that windows is very aggressive in "hogging" L1, leaving much less than 10K bytes for anything else, and ONLY when a highly repetitive "small" amount of code (_MemCopy), in this case >730,000 iterations, is when some of L1 being "hogged" by windows is released for other applications (your program). In the first running part of your program (before _MemCopy), it is apparent that the relevant code is "too large" for what is immediately free in L1 and even a billion iterations of that code set may not "displace Windows background processes". In conclusion, _MEMCOPY (machine code small) performing 730000 iterations of memory referencing, is the winner.


Now it is interesting when running the code below, which is a drastically reduced version of your program (and there is NOTHING ACTUALLY TO SEE WHEN RUNNING), that the timing differences (with _MEMCOPY) versus (without _MEMCOPY) are relatively minor and may not even warrant discussion. But now we are taking about a smaller program (8.2 Kbyte) which has a much higher chance of being "resident" in L1 than the 10.2 Kbyte (your program). Now if you ran this shortened program many times, AND also with varying applications installed/removed (eg other instances of QB64, EDGE, etc) - then you will get "widely varied results" including sometimes (with _MEMCOPY) taking LONGER than (without _MEMCOPY) and relative timing ratios approaching 3:1. It appears that the "erratic behaviour" (i.e. non-consistent) of Windows background processes is the reasoning of this.


As a side note, referring to the reply #184 mentioned above, it would appear (without proper study/analysis) that anything running in L1 is approaching being 10x faster than the first time code is being performed. One has to wait for the setup times for preloading L1 before taking advantage of this.



Code: QB64: [Select]
  1. TestLimit = 100
  2. Dim TempGrid(0 To _Width - 1, 0 To _Height - 1) As Long
  3. dim TempGridn(0 To _Width - 1, 0 To _Height - 1) As Long
  4. Dim As _MEM m, m1: m = _Mem(TempGrid()): m1 = _MemImage(0)
  5. Dim As _MEM n, n1: n = _Mem(TempGridn()): n1 = _MemImage(0)
  6. Dim o As _Offset:o = 0
  7.  
  8. tb#=timer
  9.  
  10. for trials%=1 to TestLimit
  11.     o = 0
  12.     Do Until o >= m.SIZE
  13.         If _MemGet(m, m.OFFSET + o, _Unsigned Long) <> 0 Then _MemPut m1, m1.OFFSET + o, kolor As _UNSIGNED LONG
  14.         o = o + 4
  15.     Loop
  16.  
  17. t0#=timer:
  18.  
  19. _MemCopy n1, n1.OFFSET, n1.SIZE To n, n.OFFSET '*** this _MemCopy line added
  20. for trials%=1 to TestLimit
  21.     o = 0
  22.     Do Until o >= n.SIZE
  23.         If _MemGet(n, n.OFFSET + o, _Unsigned Long) <> 0 Then _MemPut n1, n1.OFFSET + o, kolor As _UNSIGNED LONG
  24.         o = o + 4
  25.     Loop
  26.  
  27. t2#=timer
  28.  
  29. Print  "(without   _MemCopy)"; t0# - tb#
  30. Print  "(including _MemCopy)"; t2# - t0#:print
  31. print "# of MEM references GRAND TOTAL =";
  32. print using "###,###,###";_WIDTH*_HEIGHT*Testlimit;
  33.  
  34.  
  35.  
  36.  
Title: Re: Somebody explain this one to me...
Post by: TempodiBasic on December 05, 2021, 12:16:41 pm
Wow you're following the path of crucial difference in results of translator into C++.
No  bad BASIC code but bad final  C++ code.
The issue moves from coder's ability  to ability of translator into C++.
Title: Re: Somebody explain this one to me...
Post by: SMcNeill on December 05, 2021, 03:08:17 pm
The issue for us isn't one to do with L1 or L2 memory.  It has to do with how QB64 allocates memory

DIM foo(1000), if used in the main module, creates and reserves space in memory foryour 1001 elements from the moment you DIM it.

DIM foo(1000), if used in a SUB or FUNCTION, doesn't create or reserve any space, until you put values into each element.

And the difference in these two behaviors is one which affects performance quite a bit, if called repeatedly.  It's this difference that creates our difference in speed and performance. 

And, I suppose it makes sense.  DIM, in the main module, creates a permanent array.  We never free it, unless we do so manually with ERASE or CLEAR, or whatnot.  All variables and such, however, end up being temporary when created for a SUB/FUNCTION, unless we specifically declare it as STATIC.

It's the STATIC vs DYNAMIC creation of memory which is the difference in speed in the xamples here..
Title: Re: Somebody explain this one to me...
Post by: Richard on January 04, 2022, 08:00:45 am
@SMcNeill


I understand your reply regarding differences in memory allocation for arrays (Main_Program)  versus  (SUBS) and how this affects timings in using these techniques for array work.


I attempted to time the big DIM array


Dim TempGrid(0 To _Width - 1, 0 To _Height - 1) As Long


for the two "passes"  of your circle program and did not see any significant changes (minor ones exist) between your two methods as far as DIM being setup.

I attempted to slightly modify your program (my alterations shown with "*** at the end of each relevant line) and is supplied below.

Perhaps you are correct - in which case there is a a coding problem in my alterations to your program.
Would you be kind enough to point out any coding problems in my alterations to establish correct timing results to support your last reply?



Code: QB64: [Select]
  1.  
  2. DECLARE DYNAMIC LIBRARY "kernel32"                                             '***
  3.     SUB QueryPerformanceCounter (BYVAL lpPerformanceCount AS _UNSIGNED _OFFSET)'***
  4.     SUB QueryPerformanceFrequency (BYVAL lpFrequency AS _UNSIGNED _OFFSET)     '***
  5. END DECLARE                                                                    '***
  6. DIM SHARED PerformanceCount AS _INTEGER64                                      '***
  7. DIM SHARED Frequency AS _INTEGER64                                             '***
  8. DIM SHARED StartingCount AS _INTEGER64                                         '***
  9. DIM SHARED dElapsedTime AS DOUBLE                                              '***
  10. DIM SHARED dElapsedTimeOLD AS DOUBLE                                           '***
  11. DIM SHARED CumulativeTime AS DOUBLE                                            '***
  12. QueryPerformanceFrequency (_OFFSET(Frequency))                                 '***
  13. QueryPerformanceCounter (_OFFSET(StartingCount))                               '***
  14. CumulativeTime#=0                                                              '***
  15.  
  16.  
  17. Screen _NewImage(1024, 720, 32)
  18.  
  19.  
  20. TestLimit = 100
  21. Radius = 350
  22.  
  23. t# = Timer
  24. For i = 1 To TestLimit
  25.     'Cls , &HFF000000 + Int(Rnd * &HFFFFFF)
  26.     k& = &HFF000000 + Int(Rnd * &HFFFFFF)
  27.  
  28.     Circle (512, 360), Radius, k&
  29.     Fill 512, 360, k&, 0
  30.  
  31. t1# = Timer
  32. CumulativeTime_T1_T#=CumulativeTime#                                           '***
  33. CumulativeTime#=0                                                              '***
  34.  
  35. For i = 1 To TestLimit
  36.     'Cls , &HFF000000 + Int(Rnd * &HFFFFFF)
  37.     k& = &HFF000000 + Int(Rnd * &HFFFFFF)
  38.  
  39.     Circle (512, 360), Radius, k&
  40.     Fill 512, 360, k&, -1
  41. t2# = Timer
  42. CumulativeTime_T2_T1#=CumulativeTime#                                          '***
  43.  
  44. Print Using "###.#### seconds for Fill."; t1# - t#;
  45. COLOR &HFF00FF00~&                                                             '***
  46. print "             ";                                                        '***
  47. print using "#,###";CumulativeTime_T1_T#;                                      '***
  48. print " microseconds to DIM array"                                            '***
  49. COLOR &HFFFFFFFF~&
  50.  
  51.  
  52. Print Using "###.#### seconds for Fill Method -1."; t2# - t1#;
  53. COLOR &HFF00FF00~&                                                             '***
  54. print "   ";                                                                  '***
  55. print using "#,###";CumulativeTime_T2_T1#;                                     '***
  56. print " microseconds to DIM array"                                            '***
  57. COLOR &HFFFFFFFF~&                                                             '***
  58.  
  59.  
  60.  
  61.  
  62. Sub Fill (Passx, Passy, kolor As Long, METHOD)
  63. B = _Blend
  64.  
  65.  
  66. TIMERe:dElapsedTimeOLD = dElapsedTime                                          '***
  67. Dim TempGrid(0 To _Width - 1, 0 To _Height - 1) As Long
  68. TIMERe                                                                         '***
  69. CumulativeTime#=CumulativeTime#+(dElapsedTime - dElapsedTimeOLD)*1000000       '***
  70.  
  71.  
  72. Dim As _MEM m, m1: m = _Mem(TempGrid()): m1 = _MemImage(0)
  73. If METHOD Then
  74.     _MemCopy m1, m1.OFFSET, m1.SIZE To m, m.OFFSET
  75. o = 0
  76. Do Until o >= m.SIZE
  77.     If _MemGet(m1, m1.OFFSET + o, _Unsigned Long) = kolor Then
  78.         _MemPut m, m.OFFSET + o, -1 As LONG
  79.         'Else
  80.         '_MemPut m, m.OFFSET + o, 0 As LONG
  81.     End If
  82.     o = o + 4
  83.  
  84. TempGrid(Passx, Passy) = 1
  85. startx = Passx: finishx = Passx
  86. starty = Passy: finishy = Passy
  87.  
  88. Do Until finished
  89.     pass = pass + 1
  90.     finished = -1
  91.     For x = startx To finishx
  92.         For y = starty To finishy
  93.             'If TempGrid(x, y) <> 0 Then Print x, y, TempGrid(x, y): Sleep
  94.  
  95.             If TempGrid(x, y) = pass Then
  96.                 tempx = x
  97.                 Do Until tempx = 0
  98.                     If TempGrid(tempx - 1, y) = 0 Then
  99.                         TempGrid(tempx - 1, y) = pass + 1
  100.                         tempx = tempx - 1
  101.                         If tempx < startx Then startx = tempx
  102.                         finished = 0
  103.                     Else
  104.                         tempx = 0
  105.                     End If
  106.                 Loop
  107.  
  108.                 tempx = x
  109.                 Do Until tempx = _Width - 1
  110.                     If TempGrid(tempx + 1, y) = 0 Then
  111.                         TempGrid(tempx + 1, y) = pass + 1
  112.                         tempx = tempx + 1
  113.                         If tempx > finishx Then finishx = tempx
  114.                         finished = 0
  115.                     Else
  116.                         tempx = _Width - 1
  117.                     End If
  118.                 Loop
  119.  
  120.                 tempy = y
  121.                 Do Until tempy = 0
  122.                     If TempGrid(x, tempy - 1) = 0 Then
  123.                         TempGrid(x, tempy - 1) = pass + 1
  124.                         tempy = tempy - 1
  125.                         If tempy < starty Then startx = tempy
  126.                         finished = 0
  127.                     Else
  128.                         tempy = 0
  129.                     End If
  130.                 Loop
  131.  
  132.                 tempy = y
  133.                 Do Until tempy = _Height - 1
  134.                     If TempGrid(x, tempy + 1) = 0 Then
  135.                         TempGrid(x, tempy + 1) = pass + 1
  136.                         tempy = tempy + 1
  137.                         If tempy > finishy Then finishy = tempy
  138.                         finished = 0
  139.                     Else
  140.                         tempy = _Height - 1
  141.                     End If
  142.                 Loop
  143.             End If
  144.     Next y, x
  145.  
  146. o = 0
  147. Do Until o >= m.SIZE
  148.     If _MemGet(m, m.OFFSET + o, _Unsigned Long) <> 0 Then _MemPut m1, m1.OFFSET + o, kolor As _UNSIGNED LONG
  149.     o = o + 4
  150.  
  151. '***
  152. SUB TIMERe                                                                     '***
  153. QueryPerformanceCounter (_OFFSET(PerformanceCount))                            '***
  154. dElapsedTime = (CDBL(PerformanceCount - StartingCount)) / Frequency            '***
  155. END SUB                                                                        '***
  156.  
  157.