Hi all, I'm running redis-sentinel version 6.0.9 in a Docker Swarm environment. The HA setup is composed of:
1 master, 2 replicas, 3 sentinels
with anti-affinity labels across 3 availability zones.

docker-compose.yaml:
```
version: '3.7'
networks:
  private_network:
    external: true
  private_interlock_network:
    external: true
configs:
  ha-proxy-config:
    external: true
  master-config:
    external: true
  replica-config:
    external: true
  sentinel-config:
    external: true
volumes:
  master-volume:
    driver: local
    driver_opts:
      device: /docker_volumes/local/redis_1
      o: bind
      type: none
  replica1-volume:
    driver: local
    driver_opts:
      device: /docker_volumes/local/redis_2
      o: bind
      type: none
  replica2-volume:
    driver: local
    driver_opts:
      device: /docker_volumes/local/redis_3
      o: bind
      type: none
services:
  haproxy:
    image: haproxy-2.3.2
    networks:
      - private_network
    ports:
      - 6379
    stop_grace_period: 1m30s
    healthcheck:
      interval: 30s
      retries: 3
      test: haproxy -v || exit 1
      timeout: 10s
    depends_on:
      - redis-mst
      - redis-rpl-1
      - redis-rpl-2
    deploy:
      replicas: 2
      resources:
        reservations:
          cpus: '0.1'
          memory: 32M
        limits:
          cpus: '1'
          memory: 512M
      restart_policy:
        condition: on-failure
        delay: 40s
        window: 220s
      update_config:
        parallelism: 1
        delay: 30s
        order: start-first
    configs:
      - source: ha-proxy-config
        target: /tmp/haproxy/config.tar.gz
        mode: 0555
  redis-mst:
    image: redis-6.0.9
    volumes:
      - source: master-volume
        target: /data
        type: volume
    networks:
      - private_network
    ports:
      - 6379
    stop_grace_period: 1m30s
    healthcheck:
      interval: 30s
      retries: 3
      test: redis-cli ping || exit 1
      timeout: 10s
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role==worker
          - node.labels.com.docker.ucp.collection==shared
          - node.labels.split.availability.zone==zone1
      resources:
        reservations:
          cpus: '0.1'
          memory: 32M
        limits:
          cpus: '1'
          memory: 8g
      restart_policy:
        condition: on-failure
        delay: 40s
        window: 220s
      update_config:
        parallelism: 1
        delay: 30s
        order: start-first
    configs:
      - source: master-config
        target: /tmp/redis/config.tar.gz
        mode: 0555
  redis-rpl-1:
    image: redis-6.0.9
    volumes:
      - source: replica1-volume
        target: /data
        type: volume
    networks:
      - private_network
    ports:
      - 6379
    stop_grace_period: 1m30s
    healthcheck:
      interval: 30s
      retries: 3
      test: redis-cli ping || exit 1
      timeout: 10s
    depends_on:
      - redis-mst
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role==worker
          - node.labels.com.docker.ucp.collection==shared
          - node.labels.split.availability.zone==zone2
      resources:
        reservations:
          cpus: '0.1'
          memory: 32M
        limits:
          cpus: '1'
          memory: 8g
      restart_policy:
        condition: on-failure
        delay: 40s
        window: 220s
      update_config:
        parallelism: 1
        delay: 30s
        order: start-first
    configs:
      - source: replica-config
        target: /tmp/redis/config.tar.gz
        mode: 0555
  redis-rpl-2:
    image: redis-6.0.9
    volumes:
      - source: replica2-volume
        target: /data
        type: volume
    networks:
      - private_network
    ports:
      - 6379
    stop_grace_period: 1m30s
    healthcheck:
      interval: 30s
      retries: 3
      test: redis-cli ping || exit 1
      timeout: 10s
    depends_on:
      - redis-mst
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role==worker
          - node.labels.com.docker.ucp.collection==shared
          - node.labels.split.availability.zone==zone3
      resources:
        reservations:
          cpus: '0.1'
          memory: 32M
        limits:
          cpus: '1'
          memory: 8g
      restart_policy:
        condition: on-failure
        delay: 40s
        window: 220s
      update_config:
        parallelism: 1
        delay: 30s
        order: start-first
    configs:
      - source: replica-config
        target: /tmp/redis/config.tar.gz
        mode: 0555
  redis-snt-0:
    image: redis-6.0.9
    networks:
      - private_network
    ports:
      - 6379
    stop_grace_period: 1m30s
    healthcheck:
      interval: 30s
      retries: 3
      test: redis-cli -p 26379 SENTINEL sentinels MASTERSET
      timeout: 10s
    depends_on:
      - redis-mst
      - redis-rpl-1
      - redis-rpl-2
    environment:
      REDIS_TYPE: 'redis-sentinel'
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role==worker
          - node.labels.com.docker.ucp.collection==shared
          - node.labels.split.availability.zone==zone1
      resources:
        reservations:
          cpus: '0.1'
          memory: 32M
        limits:
          cpus: '1'
          memory: 512m
      restart_policy:
        condition: on-failure
        delay: 40s
        window: 220s
      update_config:
        parallelism: 1
        delay: 30s
        order: start-first
    configs:
      - source: sentinel-config
        target: /tmp/redis/config.tar.gz
        mode: 0555
  redis-snt-1:
    image: redis-6.0.9
    networks:
      - private_network
    ports:
      - 6379
    stop_grace_period: 1m30s
    healthcheck:
      interval: 30s
      retries: 3
      test: redis-cli -p 26379 SENTINEL sentinels MASTERSET
      timeout: 10s
    depends_on:
      - redis-mst
      - redis-rpl-1
      - redis-rpl-2
    environment:
      REDIS_TYPE: 'redis-sentinel'
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role==worker
          - node.labels.com.docker.ucp.collection==shared
          - node.labels.split.availability.zone==zone2
      resources:
        reservations:
          cpus: '0.1'
          memory: 32M
        limits:
          cpus: '1'
          memory: 512m
      restart_policy:
        condition: on-failure
        delay: 40s
        window: 220s
      update_config:
        parallelism: 1
        delay: 30s
        order: start-first
    configs:
      - source: sentinel-config
        target: /tmp/redis/config.tar.gz
        mode: 0555
  redis-snt-2:
    image: redis-6.0.9
    networks:
      - private_network
    ports:
      - 6379
    stop_grace_period: 1m30s
    healthcheck:
      interval: 30s
      retries: 3
      test: redis-cli -p 26379 SENTINEL sentinels MASTERSET
      timeout: 10s
    depends_on:
      - redis-mst
      - redis-rpl-1
      - redis-rpl-2
    environment:
      REDIS_TYPE: 'redis-sentinel'
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role==worker
          - node.labels.com.docker.ucp.collection==shared
          - node.labels.split.availability.zone==zone3
      resources:
        reservations:
          cpus: '0.1'
          memory: 32M
        limits:
          cpus: '1'
          memory: 512m
      restart_policy:
        condition: on-failure
        delay: 40s
        window: 220s
      update_config:
        parallelism: 1
        delay: 30s
        order: start-first
    configs:
      - source: sentinel-config
        target: /tmp/redis/config.tar.gz
        mode: 0555
```
Replica configuration:
```
replicaof redis-mst 6379
maxmemory-policy noeviction
maxmemory 6800mb
loglevel notice
appendonly yes
replica-announce-ip {{HOST_IP}}
```
(`{{HOST_IP}}` is replaced at startup with the replica container's IP.)
Sentinel configuration:
```
loglevel verbose
sentinel monitor mymaster redis-mst 6379 2
sentinel down-after-milliseconds mymaster 5000
sentinel failover-timeout mymaster 10000
sentinel parallel-syncs mymaster 1
sentinel announce-ip {{HOST_IP}}
```
(`{{HOST_IP}}` is replaced at startup with the sentinel container's IP.)
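For clarity, the `{{HOST_IP}}` placeholder is filled in by our entrypoint at container start. A minimal illustration of the substitution step (the IP and file paths here are just examples; the real script uses the container's own address):

```shell
# Hypothetical illustration of the {{HOST_IP}} templating; in the real
# entrypoint HOST_IP comes from the container itself (e.g. `hostname -i`).
HOST_IP="10.0.168.134"
printf 'replica-announce-ip {{HOST_IP}}\n' > /tmp/redis.conf.tpl
sed "s/{{HOST_IP}}/${HOST_IP}/g" /tmp/redis.conf.tpl > /tmp/redis.conf
cat /tmp/redis.conf   # -> replica-announce-ip 10.0.168.134
```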
All 6 services come up successfully and Redis Sentinel seems to bootstrap as expected, but when I start HA tests something goes wrong. Let me try to describe the issue with the following steps:
1) I put the master container to sleep with the command:
```redis-cli -p 6379 DEBUG sleep 30```
This makes the master unavailable, and Sentinel starts voting for a new master.
I see, as expected, that one of the two replicas (in this case REPLICA 2) is elected as the new master. I verify this by logging into one of the replica containers and running:
```redis-cli INFO replication``` or by checking the logs:
```
11:M 05 May 2021 10:05:00.468 # Connection with master lost.
11:M 05 May 2021 10:05:00.468 * Caching the disconnected master state.
11:M 05 May 2021 10:05:00.468 * Discarding previously cached master state.
11:M 05 May 2021 10:05:00.468 # Setting secondary replication ID to 87517fb9e5fbbd65de2befb6f0432c7a228d0784, valid up to offset: 387411. New replication ID is 3aa0ae451c4bb88860a69fe57dd2c1dba7816456
11:M 05 May 2021 10:05:00.468 * MASTER MODE enabled (user request from 'id=10 addr=10.0.168.134:35532 fd=10 name=sentinel-8b0d926b-cmd age=1251 idle=0 flags=x db=0 sub=0 psub=0 multi=4 qbuf=188 qbuf-free=32580 argv-mem=4 obl=45 oll=0 omem=0 tot-mem=61468 events=r cmd=exec user=default')
11:M 05 May 2021 10:05:00.472 # CONFIG REWRITE executed with success.
11:M 05 May 2021 10:05:01.945 * Replica 10.0.168.116:6379 asks for synchronization
11:M 05 May 2021 10:05:01.945 * Partial resynchronization request from 10.0.168.116:6379 accepted. Sending 579 bytes of backlog starting from offset 387411.
11:M 05 May 2021 10:05:35.205 * Replica 10.0.168.108:6379 asks for synchronization
11:M 05 May 2021 10:05:35.205 * Partial resynchronization not accepted: Requested offset for second ID was 461054, but I can reply up to 387411
11:M 05 May 2021 10:05:35.205 * Starting BGSAVE for SYNC with target: disk
11:M 05 May 2021 10:05:35.205 * Background saving started by pid 289
289:C 05 May 2021 10:05:35.210 * DB saved on disk
289:C 05 May 2021 10:05:35.211 * RDB: 0 MB of memory used by copy-on-write
11:M 05 May 2021 10:05:35.245 * Background saving terminated with success
11:M 05 May 2021 10:05:35.247 * Synchronization with replica 10.0.168.108:6379 succeeded
```
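For reference, this is roughly how I read the relevant fields out of the `INFO replication` output. The sample values below just mirror what REPLICA 1 reports after the failover; they are not live output:

```shell
# Parse role and master_host from a captured `redis-cli INFO replication`
# output (sample data, taken from the failover described above).
INFO='role:slave
master_host:10.0.168.128
master_port:6379'
ROLE=$(printf '%s\n' "$INFO" | awk -F: '/^role:/{print $2}')
MASTER_HOST=$(printf '%s\n' "$INFO" | awk -F: '/^master_host:/{print $2}')
echo "role=$ROLE master=$MASTER_HOST"   # -> role=slave master=10.0.168.128
```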
The REPLICA 1 container also updates its configuration, because in its logs I see the following:
```
11:S 05 May 2021 10:05:01.353 # Connection with master lost.
11:S 05 May 2021 10:05:01.353 * Caching the disconnected master state.
11:S 05 May 2021 10:05:01.353 * REPLICAOF 10.0.168.128:6379 enabled (user request from 'id=9 addr=10.0.168.134:50330 fd=9 name=sentinel-8b0d926b-cmd age=1252 idle=0 flags=x db=0 sub=0 psub=0 multi=4 qbuf=339 qbuf-free=32429 argv-mem=4 obl=45 oll=0 omem=0 tot-mem=61468 events=r cmd=exec user=default')
11:S 05 May 2021 10:05:01.360 # CONFIG REWRITE executed with success.
11:S 05 May 2021 10:05:01.935 * Connecting to MASTER 10.0.168.128:6379
11:S 05 May 2021 10:05:01.935 * MASTER <-> REPLICA sync started
11:S 05 May 2021 10:05:01.935 * Non blocking connect for SYNC fired the event.
11:S 05 May 2021 10:05:01.937 * Master replied to PING, replication can continue...
11:S 05 May 2021 10:05:01.943 * Trying a partial resynchronization (request 87517fb9e5fbbd65de2befb6f0432c7a228d0784:387411).
11:S 05 May 2021 10:05:01.945 * Successful partial resynchronization with master.
11:S 05 May 2021 10:05:01.945 # Master replication ID changed to 3aa0ae451c4bb88860a69fe57dd2c1dba7816456
11:S 05 May 2021 10:05:01.945 * MASTER <-> REPLICA sync: Master accepted a Partial Resynchronization.
```
Now, if I remove the REPLICA 1 container and force it to restart, the problem appears.
In fact, after the new REPLICA 1 container comes up, it joins the OLD master and does not update its configuration to point to the new one.
In the new REPLICA 1 container logs I see:
```
11:C 05 May 2021 10:17:36.987 # oO0OoO0OoO0Oo Redis is starting oO0OoO0OoO0Oo
11:C 05 May 2021 10:17:36.988 # Redis version=6.0.9, bits=64, commit=00000000, modified=0, pid=11, just started
11:C 05 May 2021 10:17:36.988 # Configuration loaded
11:S 05 May 2021 10:17:36.989 * Running mode=standalone, port=6379.
11:S 05 May 2021 10:17:36.989 # WARNING: The TCP backlog setting of 511 cannot be enforced because /proc/sys/net/core/somaxconn is set to the lower value of 128.
11:S 05 May 2021 10:17:36.989 # Server initialized
11:S 05 May 2021 10:17:36.989 # WARNING you have Transparent Huge Pages (THP) support enabled in your kernel. This will create latency and memory usage issues with Redis. To fix this issue run the command 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled' as root, and add it to your /etc/rc.local in order to retain the setting after a reboot. Redis must be restarted after THP is disabled (set to 'madvise' or 'never').
11:S 05 May 2021 10:17:37.003 * Reading RDB preamble from AOF file...
11:S 05 May 2021 10:17:37.003 * Loading RDB produced by version 6.0.9
11:S 05 May 2021 10:17:37.003 * RDB age 2022 seconds
11:S 05 May 2021 10:17:37.003 * RDB memory usage when created 1.79 Mb
11:S 05 May 2021 10:17:37.003 * RDB has an AOF tail
11:S 05 May 2021 10:17:37.003 * Reading the remaining AOF tail...
11:S 05 May 2021 10:17:37.038 * DB loaded from append only file: 0.049 seconds
11:S 05 May 2021 10:17:37.038 * Ready to accept connections
11:S 05 May 2021 10:17:37.039 * Connecting to MASTER redis-mst:6379
11:S 05 May 2021 10:17:37.040 * MASTER <-> REPLICA sync started
11:S 05 May 2021 10:17:37.040 * Non blocking connect for SYNC fired the event.
11:S 05 May 2021 10:17:37.041 * Master replied to PING, replication can continue...
11:S 05 May 2021 10:17:37.042 * Partial resynchronization not possible (no cached master)
11:S 05 May 2021 10:17:37.044 * Full resync from master: 3aa0ae451c4bb88860a69fe57dd2c1dba7816456:567430
11:S 05 May 2021 10:17:37.108 * MASTER <-> REPLICA sync: receiving 73696 bytes from master to disk
11:S 05 May 2021 10:17:37.113 * MASTER <-> REPLICA sync: Flushing old data
11:S 05 May 2021 10:17:37.113 * MASTER <-> REPLICA sync: Loading DB in memory
11:S 05 May 2021 10:17:37.117 * Loading RDB produced by version 6.0.9
11:S 05 May 2021 10:17:37.117 * RDB age 0 seconds
11:S 05 May 2021 10:17:37.117 * RDB memory usage when created 2.23 Mb
11:S 05 May 2021 10:17:37.118 * MASTER <-> REPLICA sync: Finished with success
11:S 05 May 2021 10:17:37.119 * Background append only file rewriting started by pid 17
11:S 05 May 2021 10:17:37.144 * AOF rewrite child asks to stop sending diffs.
17:C 05 May 2021 10:17:37.144 * Parent agreed to stop sending diffs. Finalizing AOF...
17:C 05 May 2021 10:17:37.144 * Concatenating 0.00 MB of AOF diff received from parent.
17:C 05 May 2021 10:17:37.145 * SYNC append only file rewrite performed
17:C 05 May 2021 10:17:37.146 * AOF rewrite: 0 MB of memory used by copy-on-write
11:S 05 May 2021 10:17:37.240 * Background AOF rewrite terminated with success
11:S 05 May 2021 10:17:37.241 * Residual parent diff successfully flushed to the rewritten AOF (0.00 MB)
11:S 05 May 2021 10:17:37.242 * Background AOF rewrite finished successfully
```
And if I look at the logs of the OLD master (now demoted to replica), I see the following:
```
11:S 05 May 2021 10:17:37.043 * Replica 10.0.168.151:6379 asks for synchronization
11:S 05 May 2021 10:17:37.043 * Full resync requested by replica 10.0.168.151:6379
11:S 05 May 2021 10:17:37.043 * Starting BGSAVE for SYNC with target: disk
11:S 05 May 2021 10:17:37.044 * Background saving started by pid 446
446:C 05 May 2021 10:17:37.056 * DB saved on disk
446:C 05 May 2021 10:17:37.057 * RDB: 0 MB of memory used by copy-on-write
11:S 05 May 2021 10:17:37.107 * Background saving terminated with success
11:S 05 May 2021 10:17:37.112 * Synchronization with replica 10.0.168.151:6379 succeeded
```
Why this behaviour? Now I have a replica of a replica.
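My current guess at what the logs suggest (all paths here are hypothetical): Sentinel's CONFIG REWRITE updates the config file only inside the running container, so a container recreated from the original config tarball starts again with the static `replicaof redis-mst 6379` directive. A minimal simulation of that idea:

```shell
# Hypothetical simulation of losing the CONFIG REWRITE on container restart.
# 1) Config as shipped in the config tarball:
printf 'replicaof redis-mst 6379\n' > /tmp/rpl.conf.orig
# 2) The running container's copy, after Sentinel's CONFIG REWRITE:
sed 's/^replicaof .*/replicaof 10.0.168.128 6379/' /tmp/rpl.conf.orig > /tmp/rpl.conf
grep '^replicaof' /tmp/rpl.conf   # now points at the promoted replica
# 3) docker rm + restart: the config is extracted again from the tarball,
#    so the rewrite from step 2 is gone:
cp /tmp/rpl.conf.orig /tmp/rpl.conf
grep '^replicaof' /tmp/rpl.conf   # back to redis-mst
```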
This behaviour is the same with and without persistence. At first I suspected persistence could be the issue, but it seems it isn't. My expectation is that, after a reboot, REPLICA 1 should know which node is the new master; am I missing something else?
I'm stuck and I don't know what other tests I could perform here. I hope someone can give me feedback and help me fix this for my production environment. Thanks in advance.