How to create a performance timeline for a tf.data.Dataset?

I'm trying to see what takes so long in my TensorFlow Dataset pipeline. Unfortunately, when I run profiling, the whole execution of my Dataset is covered by one operation: "IteratorGetNext". Is there a way to peek inside the Dataset graph to see each map separately?

Here is a minimal example that can be made to run faster by adding num_parallel_calls. Unfortunately, one can't tell that from the timeline, as the whole pipeline appears as a single operation (see screenshot):

import tensorflow as tf
from tensorflow.python.ops import io_ops
from tensorflow.contrib.framework.python.ops import audio_ops

# Build the input pipeline in its own graph; the session further down is
# created with graph=g and fetches get_next from it.
g = tf.Graph()
with g.as_default():
  # Source: every .wav file matched by the glob below.
  wav_paths = tf.data.Dataset.list_files("work/input/train/audio/**/*.wav")

  # Stage 1: read the raw contents of each matched file.
  wav_bytes = wav_paths.map(lambda path: io_ops.read_file(path))

  # Stage 2: decode the WAV contents, requesting one channel and
  # 16000 samples per file.
  decoded = wav_bytes.map(
      lambda contents: audio_ops.decode_wav(
          contents, desired_channels=1, desired_samples=16000))

  # Stage 3: group the decoded elements into batches of 30 * 1000 and
  # prefetch with a buffer size of 2.
  ds = decoded.batch(30 * 1000).prefetch(2)

  iterator = ds.make_one_shot_iterator()
  get_next = iterator.get_next()


# run_metadata is passed to sess.run below; its step_stats are what the
# timeline is generated from afterwards.
run_metadata = tf.RunMetadata()
# Request full tracing for the profiled run.
run_config = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)


with tf.Session(graph=g) as sess:
  # Pass the options object defined above instead of constructing a second,
  # identical RunOptions inline (run_config was previously unused).
  sess.run(get_next,
           options=run_config,
           run_metadata=run_metadata)

from tensorflow.python.client import timeline
# Wrap the step statistics collected during sess.run in a Timeline so they
# can be exported in Chrome trace format.
trace = timeline.Timeline(step_stats=run_metadata.step_stats)

# Use a with-block so the file is closed even if generating or writing the
# trace raises.
with open('timelines/example.json', 'w') as trace_file:
  trace_file.write(trace.generate_chrome_trace_format())

enter image description here