cuDNN failing to initialize: possible version mismatch?
I'm running the code below to train a BPR neural network, and I'm getting the following error:
"Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above."
From googling, this seems to indicate that cuDNN isn't initializing, maybe because I have the wrong versions of tensorflow-gpu, cuDNN, and cudatoolkit. Can anyone tell me whether my versions are correct, or what else the issue might be and how to solve it? The code was originally written for TensorFlow 1.3, but I'm using Python 3.6 and they appear to be incompatible, so I tried to use the lowest TensorFlow version I could get to run on Python 3.6. My installed versions are:
tensorboard 1.15.0 pyhb230dea_0
tensorflow 1.15.0 gpu_py36h5a509aa_0
tensorflow-base 1.15.0 gpu_py36h9dcbed7_0
tensorflow-estimator 1.15.1 pyh2649769_0
tensorflow-gpu 1.15.0
cudatoolkit 10.0.130 0 anaconda
cudnn 7.6.5 cuda10.0_0
code:
# Training driver: create a TF1 session, start the background image-loader
# threads, then run optimizer steps, checkpointing and evaluating once per
# completed pass over the training interactions.
config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
# Grow GPU memory on demand instead of reserving it all up front. With a
# matching TF / CUDA / cuDNN stack, "Failed to get convolution algorithm ...
# cuDNN failed to initialize" is most often caused by cuDNN being unable to
# allocate its workspace, which this setting mitigates.
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
sess.run(init)

# Daemon threads so the loaders never keep the interpreter alive on exit.
t = [threading.Thread(target=load_image_async) for _ in range(numldprocess)]
for worker in t:
    worker.daemon = True
    worker.start()

# Number of training interactions summed over every key of user_train;
# one "iteration" (epoch) is this many samples.
oneiteration = sum(len(user_train[item]) for item in user_train)

step = 1
# Only variables whose name starts with 'DVBPR' are checkpointed.
saver = tf.train.Saver(
    [k for k in tf.global_variables() if k.name.startswith('DVBPR')]
)

epoch = 0
# The context manager guarantees the log is flushed and closed even when a
# training step raises (the original handle was never closed).
with open('DVBPR.log', 'w') as f:
    while step * batch_size <= training_epoch * oneiteration + 1:
        sess.run(optimizer, feed_dict={keep_prob: dropout})
        print('Step#' + str(step) + ' CNN update')

        # Floor division: with true division (Python 3 `/`) this comparison is
        # already true on the first step, so an evaluation would run
        # immediately and every epoch label would be one too high. Presumably
        # the code was written for Python 2, where int `/` floors.
        if step * batch_size // oneiteration > epoch:
            epoch += 1
            saver.save(sess, './DVBPR_auc_' + str(K) + '_' + str(step) + '.ckpt')
            auc_valid, auc_test = Evaluation(step)
            report = 'Epoch #' + str(epoch) + ':' + str(auc_test) + ' ' + str(auc_valid) + '\n'
            print(report)
            f.write(report)
            # Flush per epoch so progress is visible while training runs.
            f.flush()

        step += 1

print("Optimization Finished!")
error:
---------------------------------------------------------------------------
UnknownError Traceback (most recent call last)
~/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tensorflow_core/python/client/session.py in _do_call(self, fn, *args)
1364 try:
-> 1365 return fn(*args)
1366 except errors.OpError as e:
~/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tensorflow_core/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1349 return self._call_tf_sessionrun(options, feed_dict, fetch_list,
-> 1350 target_list, run_metadata)
1351
~/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tensorflow_core/python/client/session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
1442 fetch_list, target_list,
-> 1443 run_metadata)
1444
UnknownError: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
[[{{node DVBPR/Conv2D}}]]
During handling of the above exception, another exception occurred:
UnknownError Traceback (most recent call last)
<ipython-input-16-4a27b3e61c3e> in <module>
21 while step * batch_size <= training_epoch*oneiteration+1:
22
---> 23 sess.run(optimizer, feed_dict={keep_prob: dropout})
24
25 print('Step#'+str(step)+' CNN update')
~/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tensorflow_core/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
954 try:
955 result = self._run(None, fetches, feed_dict, options_ptr,
--> 956 run_metadata_ptr)
957 if run_metadata:
958 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
~/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tensorflow_core/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1178 if final_fetches or final_targets or (handle and feed_dict_tensor):
1179 results = self._do_run(handle, final_targets, final_fetches,
-> 1180 feed_dict_tensor, options, run_metadata)
1181 else:
1182 results = []
~/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tensorflow_core/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1357 if handle is None:
1358 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1359 run_metadata)
1360 else:
1361 return self._do_call(_prun_fn, handle, feeds, fetches)
~/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tensorflow_core/python/client/session.py in _do_call(self, fn, *args)
1382 '\nsession_config.graph_options.rewrite_options.'
1383 'disable_meta_optimizer = True')
-> 1384 raise type(e)(node_def, op, message)
1385
1386 def _extend_graph(self):
UnknownError: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
[[node DVBPR/Conv2D (defined at /home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1748) ]]
Original stack trace for 'DVBPR/Conv2D':
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/traitlets/config/application.py", line 664, in launch_instance
app.start()
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 612, in start
self.io_loop.start()
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 149, in start
self.asyncio_loop.run_forever()
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/asyncio/base_events.py", line 442, in run_forever
self._run_once()
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/asyncio/base_events.py", line 1462, in _run_once
handle._run()
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/asyncio/events.py", line 145, in _run
self._callback(*self._args)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tornado/ioloop.py", line 690, in <lambda>
lambda f: self._run_callback(functools.partial(callback, future))
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tornado/ioloop.py", line 743, in _run_callback
ret = callback()
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tornado/gen.py", line 787, in inner
self.run()
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tornado/gen.py", line 748, in run
yielded = self.gen.send(value)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 365, in process_one
yield gen.maybe_future(dispatch(*args))
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
yielded = next(result)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 268, in dispatch_shell
yield gen.maybe_future(handler(stream, idents, msg))
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
yielded = next(result)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 545, in execute_request
user_expressions, allow_stdin,
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
yielded = next(result)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 306, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2867, in run_cell
raw_cell, store_history, silent, shell_futures)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2895, in _run_cell
return runner(coro)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
coro.send(None)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3072, in run_cell_async
interactivity=interactivity, compiler=compiler, result=result)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3263, in run_ast_nodes
if (await self.run_code(code, result, async_=asy)):
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-14-e6fb5ccc089e>", line 29, in <module>
result1 = CNN(image1,dropout)
File "<ipython-input-11-d043bee5e9ec>", line 13, in CNN
conv1 = conv2d(x, Weights('wc1'), Biases('bc1'), strides=4)
File "<ipython-input-10-cdb0788297c6>", line 4, in conv2d
x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tensorflow_core/python/ops/nn_ops.py", line 2010, in conv2d
name=name)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tensorflow_core/python/ops/gen_nn_ops.py", line 1071, in conv2d
data_format=data_format, dilations=dilations, name=name)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
op_def=op_def)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tensorflow_core/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
attrs, op_def, compute_device)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
op_def=op_def)
File "/home/username/anaconda3/envs/sb_dvbpr/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
self._traceback = tf_stack.extract_stack()