Ignite client takes a long time to start when connecting to multiple nodes

Scenario: I have two server nodes running initially, and when client nodes try to connect, each client takes 15+ minutes to start. Please find the server configuration below; the only change for the other server node is the IP address. On the console I am getting the error below. Thanks in advance.

[12:42:10] Possible failure suppressed accordingly to a configured handler [hnd=StopNodeOrHaltFailureHandler [tryStop=false, timeout=0, super=AbstractFailureHandler [ignoredFailureTypes=UnmodifiableSet [SYSTEM_WORKER_BLOCKED, SYSTEM_CRITICAL_OPERATION_TIMEOUT]]], failureCtx=FailureContext [type=SYSTEM_WORKER_BLOCKED, err=class o.a.i.IgniteException: GridWorker [name=tcp-comm-worker, igniteInstanceName=null, finished=false, heartbeatTs=1600672317715]]] [12:42:40,486][SEVERE][tcp-disco-msg-worker-[5023dc59 172.16.0.189:48510]-#2][G] Blocked system-critical thread has been detected. This can lead to cluster-wide undefined behaviour [workerName=tcp-comm-worker, threadName=tcp-comm-worker-#1, blockedFor=18s] [12:42:40] Possible failure suppressed accordingly to a configured handler [hnd=StopNodeOrHaltFailureHandler [tryStop=false, timeout=0, super=AbstractFailureHandler [ignoredFailureTypes=UnmodifiableSet [SYSTEM_WORKER_BLOCKED, SYSTEM_CRITICAL_OPERATION_TIMEOUT]]], failureCtx=FailureContext [type=SYSTEM_WORKER_BLOCKED, err=class o.a.i.IgniteException: GridWorker [name=tcp-comm-worker, igniteInstanceName=null, finished=false, heartbeatTs=1600672341604]]] [12:42:49,498][SEVERE][tcp-disco-msg-worker-[5023dc59 172.16.0.189:48510]-#2][G] Blocked system-critical thread has been detected. 
This can lead to cluster-wide undefined behaviour [workerName=tcp-comm-worker, threadName=tcp-comm-worker-#1, blockedFor=27s] [12:42:49] Possible failure suppressed accordingly to a configured handler [hnd=StopNodeOrHaltFailureHandler [tryStop=false, timeout=0, super=AbstractFailureHandler [ignoredFailureTypes=UnmodifiableSet [SYSTEM_WORKER_BLOCKED, SYSTEM_CRITICAL_OPERATION_TIMEOUT]]], failureCtx=FailureContext [type=SYSTEM_WORKER_BLOCKED, err=class o.a.i.IgniteException: GridWorker [name=tcp-comm-worker, igniteInstanceName=null, finished=false, heartbeatTs=1600672341604]]] [12:43:01,603][SEVERE][tcp-disco-msg-worker-[5023dc59 172.16.0.189:48510]-#2][G] Blocked system-critical thread has been detected. This can lead to cluster-wide undefined behaviour [workerName=tcp-comm-worker, threadName=tcp-comm-worker-#1, blockedFor=39s]

`` -->

-->
<!--    <property name="consistentId" value="#{ systemEnvironment['IGNITE_CONSISTENT_ID'] }" />   -->
    <!-- Enable task execution events for examples. -->



<!-- Storage settings: the default data region has persistence enabled. -->
<property name="dataStorageConfiguration">
    <bean class="org.apache.ignite.configuration.DataStorageConfiguration">
        <property name="defaultDataRegionConfiguration">
            <bean class="org.apache.ignite.configuration.DataRegionConfiguration">
                <property name="persistenceEnabled" value="true"/>
                <!-- Sizes are SpEL expressions evaluated to bytes: 1 GiB initial, 4 GiB maximum. -->
                <property name="initialSize" value="#{1L * 1024 * 1024 * 1024}"/>
                <property name="maxSize" value="#{4L * 1024 * 1024 * 1024}"/>
            </bean>
        </property>
    </bean>
</property>
           <!-- Explicitly configure TCP discovery SPI to provide list of initial nodes. -->
    <property name="discoverySpi">
        <bean class="org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi">
            <!-- First port the discovery SPI tries to bind on this node. -->
            <property name="localPort" value="48510"/>
            <property name="ipFinder">
                <!--
                    Static, address-list based discovery. Ignite also offers automatic discovery
                    (for example org.apache.ignite.spi.discovery.tcp.ipfinder.multicast.TcpDiscoveryMulticastIpFinder);
                    for all options see http://apacheignite.readme.io/docs/cluster-config
                -->
                <bean class="org.apache.ignite.spi.discovery.tcp.ipfinder.vm.TcpDiscoveryVmIpFinder">
                    <property name="addresses">
                        <list>
                            <!-- In a distributed environment, replace with the actual host IP addresses. -->
                            <!-- Each entry covers the port range 48510 to 48512 on that host. -->
                            <value>127.0.0.1:48510..48512</value>
                            <value>X.16.0.X:48510..48512</value>
                        </list>
                    </property>
                </bean>
            </property>
        </bean>
    </property>
    <property name="communicationSpi">
        <bean class="org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi">
            <!-- Node-to-node communication port; the Java client in this file is configured with the same number. -->
            <property name="localPort" value="48110"/>
            <!-- Optional, currently disabled: <property name="localPortRange" value="1000"/> -->
        </bean>
    </property>
    <property name="clientConnectorConfiguration">
        <bean class="org.apache.ignite.configuration.ClientConnectorConfiguration">
            <!-- Client connector port; the hard-coded JDBC thin URL in the Java client targets 10801 as well. -->
            <property name="port" value="10801"/>
        </bean>
    </property>
<!-- Custom attributes attached to this node. -->
<property name="userAttributes">
    <map>
        <!-- NOTE(review): "SecindNode" looks like a typo of "SecondNode"; left unchanged because other code may match the exact value. Confirm before renaming. -->
        <entry key="ROLE" value="SecindNode"/>
    </map>
</property>
 </bean>

``


Client Code

`` public final class IgniteConnectionUtil {

/** Logger shared by every member of this utility. */
private static final Logger logger = Logger.getLogger(IgniteConnectionUtil.class);

/** Lazily created singleton, assigned in {@code getInstance()}. */
private static IgniteConnectionUtil instance;
/** Handle of the started client node; {@code null} until {@code init()} succeeds and after {@code shutdown()}. */
private static Ignite ignite;
/** Cache name used when building the cache configuration. Never reassigned, hence {@code final}. */
private static final String CACHE_NAME = "CollectionCache";
/** Host part of the JDBC thin URL; populated by {@code initializeJDBCThinDriver()}. */
private static String jdbcThinHost = null;

/**
 * Creates the singleton instance.
 *
 * <p>Starts the Ignite client when no client handle exists yet, then clears the
 * in-memory table store if the {@code CLEAR_REDIS_MAP} flag is set. Clearing the
 * store is best-effort: any failure is logged and never propagated.
 */
private IgniteConnectionUtil() {
    if (ignite == null) {
        init();
    }
    try {
        boolean clearRedisMap = ConfigurationManager.getInstance().getPropertyAsBoolean("CLEAR_REDIS_MAP",
                "IN_MEMORY_DB", "CONFIG");
        if (clearRedisMap) {
            InMemoryTableStore.getInstance().clearStore();
        }
    } catch (Exception e) {
        // Still best-effort, but keep the cause so the failure can be diagnosed from the log.
        logger.info("Unable to clear ignite-redis map", e);
    }
}

/**
 * Starts the Ignite client node and stores its handle in {@code ignite}.
 *
 * <p>Does nothing when Ignite is disabled in configuration or a client handle already
 * exists. Otherwise it configures discovery (static IP finder), communication and
 * timeouts, starts the node in client mode, activates the cluster, enables baseline
 * auto-adjust and loads the JDBC thin driver.
 *
 * <p>Any exception raised along the way is logged and not rethrown.
 */
public static synchronized void init() {
    // Used when CLIENT_FAILURE_DETECTION_TIMEOUT is not configured with a positive value.
    final long defaultClientFailureDetectionTimeout = 30000L;
    try {
        if (!isIgniteEnabled() || ignite != null) {
            return;
        }
        logger.info("Ignite Client starting");
        Ignition.setClientMode(true);

        DataStorageConfiguration storageCfg = new DataStorageConfiguration();
        storageCfg.setWalMode(WALMode.BACKGROUND);

        IgniteConfiguration cfg = new IgniteConfiguration();
        cfg.setDataStorageConfiguration(storageCfg);
        cfg.setPeerClassLoadingEnabled(true);

        TcpDiscoverySpi discoverySpi = new TcpDiscoverySpi();
        TcpDiscoveryVmIpFinder ipFinder = new TcpDiscoveryVmIpFinder();
        // NOTE(review): SERVER_ADDRESS is read but not used; the address list below is
        // hard-coded instead. Confirm the property's format before wiring it in.
        String serverIp = ConfigurationManager.getInstance()
                .getPropertyAsString("SERVER_ADDRESS", "IN_MEMORY_DB", "CONFIG");
        //ipFinder.setAddresses(Arrays.asList(serverIp));
        ipFinder.setAddresses(
                Arrays.asList("127.0.0.1:48510", "127.0.0.1:48511", "127.0.0.1:48512",
                        "X.16.0.189:48510", "X.16.0.X:48511", "X.16.0.X:48512"));
        discoverySpi.setLocalPort(48510);

        // Timeout for which the client node will try to connect to Ignite servers;
        // it will throw an exception and exit if no server can be found.
        long discoveryTimeout = ConfigurationManager.getInstance()
                .getPropertyAsLong("DISCOVERY_TIMEOUT", "IN_MEMORY_DB", "CONFIG");
        discoverySpi.setIpFinder(ipFinder).setJoinTimeout(discoveryTimeout);

        TcpCommunicationSpi commSpi = new TcpCommunicationSpi();
        long communicationTimeout = ConfigurationManager.getInstance()
                .getPropertyAsLong("COMMUNICATION_TIMEOUT", "IN_MEMORY_DB", "CONFIG");
        commSpi.setConnectTimeout(communicationTimeout).setLocalPort(48110);

        // This timeout is used to reconnect the client to a server that has failed/restarted.
        // Honour the configured value; previously it was read and then ignored in favour
        // of a hard-coded 30000 ms, which is now only the fallback.
        long clientFailureDetectionTimeout = ConfigurationManager.getInstance()
                .getPropertyAsLong("CLIENT_FAILURE_DETECTION_TIMEOUT", "IN_MEMORY_DB", "CONFIG");
        cfg.setClientFailureDetectionTimeout(clientFailureDetectionTimeout > 0
                ? clientFailureDetectionTimeout
                : defaultClientFailureDetectionTimeout);

        cfg.setDiscoverySpi(discoverySpi);
        cfg.setCommunicationSpi(commSpi);

        //cfg.setIncludeEventTypes(EventType.EVT_NODE_JOINED);

        ignite = Ignition.start(cfg);
        ignite.cluster().active(true);

        ignite.cluster().baselineAutoAdjustEnabled(true);
        ignite.cluster().baselineAutoAdjustTimeout(30000);

        initializeJDBCThinDriver();
        //igniteEventListen();
        logger.info("Ignite Client started");

    } catch (Exception e) {
        logger.error("Error in starting ignite cluster", e);
    }
}

/**
 * Returns the singleton, creating it on first use.
 *
 * <p>On later calls the client state is checked first: a missing client is restarted,
 * a client stopped on segmentation is dropped and restarted, and an inactive cluster
 * is activated. Failures during this check are logged and do not prevent the
 * instance from being returned.
 *
 * @return the singleton instance
 */
public static synchronized IgniteConnectionUtil getInstance() {
    if (instance == null) {
        instance = new IgniteConnectionUtil();
        return instance;
    }
    try {
        if (ignite == null || ignite.cluster() == null) {
            logger.error("Illegal Ignite state. Will try to restart ignite clinet.");
            init();
        } else if (Ignition.state().equals(IgniteState.STOPPED_ON_SEGMENTATION)) {
            logger.error("Reconnecting to Ignite");
            // init() is a no-op while a handle is present, so drop the stale one first.
            ignite = null;
            init();
        } else if (!ignite.cluster().active()) {
            ignite.cluster().active(true);
        }
    } catch (Exception e) {
        // Include the cause; the bare message alone gives no hint of what went wrong.
        logger.error("Ignite Exception. Please restart ignite server.", e);
    }
    return instance;
}

/**
 * Loads the Ignite JDBC thin driver class and, once it is available, reads the
 * JDBC host from configuration into {@code jdbcThinHost}. When the driver class
 * is missing the error is logged and the host is left untouched.
 */
public static void initializeJDBCThinDriver() {
    final String driverClassName = "org.apache.ignite.IgniteJdbcThinDriver";
    try {
        Class.forName(driverClassName);
    } catch (ClassNotFoundException driverMissing) {
        logger.error("Error in loading IgniteJdbcThinDriver class", driverMissing);
        return;
    }
    jdbcThinHost = ConfigurationManager.getInstance()
            .getPropertyAsString("JDBC_THIN_HOST", "IN_MEMORY_DB", "CONFIG");
}

/**
 * Opens a JDBC thin connection to Ignite.
 *
 * <p>The configured {@code jdbcThinHost} is used when present; otherwise the built-in
 * fallback address is used. {@code DriverManager.getConnection} never returns
 * {@code null} (it throws {@link SQLException} instead), so the fallback is chosen
 * before connecting rather than by a null check on the result.
 *
 * @return an open connection, or {@code null} if connecting failed (the error is logged);
 *         this method does not close the connection, so the caller must
 */
public Connection getJDBCConnection() {
    final String fallbackUrl = "jdbc:ignite:thin://172.16.0.189:10801/";
    final boolean hostConfigured = jdbcThinHost != null && !jdbcThinHost.trim().isEmpty();
    final String url = hostConfigured
            ? "jdbc:ignite:thin://" + jdbcThinHost + "/"
            : fallbackUrl;
    try {
        return DriverManager.getConnection(url);
    } catch (SQLException e) {
        logger.error("Error in getting Ignite JDBC connection", e);
        return null;
    }
}

/**
 * Returns the cache with the given name, creating it if necessary.
 *
 * <p>The cache is partitioned with one backup, asynchronous rebalancing, atomic
 * mode, primary-sync writes, reads from backups, copy-on-read, on-heap caching and
 * the {@code PUBLIC} SQL schema.
 *
 * @param cacheName name of the cache; when {@code null} or empty the default
 *                  {@code CACHE_NAME} is used (previously the argument was ignored
 *                  and the default was always used)
 * @return the existing or newly created cache
 * @throws IgniteSQLException if the Ignite client has not been started
 */
public IgniteCache<?, ?> getOrCreateCache(String cacheName) {
    if (ignite == null) {
        throw new IgniteSQLException("Internal Server Error Please contact support");
    }

    final String effectiveName = (cacheName == null || cacheName.isEmpty()) ? CACHE_NAME : cacheName;

    CacheConfiguration<?, ?> cacheConfig = new CacheConfiguration<>(effectiveName);
    //cacheConfig.setDataRegionName("500MB_Region");
    cacheConfig.setCacheMode(CacheMode.PARTITIONED);
    cacheConfig.setBackups(1);
    cacheConfig.setRebalanceMode(CacheRebalanceMode.ASYNC);
    cacheConfig.setAtomicityMode(CacheAtomicityMode.ATOMIC);
    cacheConfig.setWriteSynchronizationMode(CacheWriteSynchronizationMode.PRIMARY_SYNC);
    cacheConfig.setReadFromBackup(true);
    cacheConfig.setCopyOnRead(true);
    cacheConfig.setOnheapCacheEnabled(true);
    cacheConfig.setSqlSchema("PUBLIC");

    return ignite.getOrCreateCache(cacheConfig);
}

/**
 * Returns the default cache ({@code CACHE_NAME}), creating it if necessary.
 *
 * <p>Delegates to {@code getOrCreateCache(String)}, which applies the same cache
 * settings this method used to spell out itself.
 *
 * @return the existing or newly created default cache
 * @throws IgniteSQLException if the Ignite client has not been started
 */
public IgniteCache<?, ?> getOrCreateCache() {
    return getOrCreateCache(CACHE_NAME);
}

/**
 * Closes the Ignite client if one is running and always clears the handle afterwards.
 *
 * @throws Exception wrapping the {@code IgniteException} raised while closing
 */
public static synchronized void shutdown() throws Exception {
    try {
        if (ignite == null) {
            return;
        }
        ignite.close();
    } catch (IgniteException closeFailure) {
        throw new Exception(closeFailure);
    } finally {
        // Runs on every path, including the early return above.
        ignite = null;
    }
}

/**
 * Reads the {@code ENABLED} flag of the {@code IN_MEMORY_DB} configuration group.
 *
 * @return the value of the flag
 * @throws Exception declared for callers; propagated from the configuration lookup if it fails
 */
public static boolean isIgniteEnabled() throws Exception {
    final boolean enabled = ConfigurationManager.getInstance()
            .getPropertyAsBoolean("ENABLED", "IN_MEMORY_DB");
    return enabled;
}

} ``

1 answer

  • answered 2020-09-21 08:56 alamar

    Blocked system-critical thread has been detected. This can lead to cluster-wide undefined behaviour [workerName=tcp-comm-worker, threadName=tcp-comm-worker-#1, blockedFor=18s]

    This most likely means that the server node cannot connect to the client's communication port (47100 by default; 48110 in the configuration above), or vice versa. In 2.8.1 and earlier, the connection must be possible in both directions. In 2.9, a new operation mode will be introduced in which the server never tries to connect to the client; only the client connects to the server.