Module note_seq.melody_inference
Infer melody from polyphonic NoteSequence.
Expand source code
# Copyright 2021 The Magenta Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Infer melody from polyphonic NoteSequence."""
import bisect
from note_seq import constants
from note_seq import sequences_lib
import numpy as np
import scipy.linalg
# Sentinel melody event representing a rest (no melody note sounding).
REST = -1

# MIDI velocity assigned to every inferred melody note.
MELODY_VELOCITY = 127

# Maximum number of melody frames to infer.
MAX_NUM_FRAMES = 10000
def _melody_transition_distribution(rest_prob, interval_prob_fn):
  """Build the melody-event transition matrix over pitches and rest.

  Args:
    rest_prob: Probability that a melody note will be followed by a rest.
    interval_prob_fn: Function mapping a signed pitch interval (value between
        -127 and 127) to an unnormalized weight. Weights are normalized so
        that outgoing probabilities (including rest) from each pitch sum to
        one.

  Returns:
    A 257-by-257 melody event transition matrix. Row/column zero represents
    rest. Rows/columns 1-128 represent MIDI note onsets for all pitches.
    Rows/columns 129-256 represent MIDI note continuations (i.e. non-onsets)
    for all pitches.
  """
  num_pitches = constants.MAX_MIDI_PITCH - constants.MIN_MIDI_PITCH + 1
  max_interval = num_pitches - 1

  # Weight of every possible signed pitch interval.
  interval_weights = np.vectorize(interval_prob_fn)(
      np.arange(-max_interval, max_interval + 1))

  # Onset-to-onset transition matrix: entry (i, j) is proportional to the
  # weight of the interval from pitch i to pitch j.
  onset_mat = scipy.linalg.toeplitz(interval_weights[max_interval::-1],
                                    interval_weights[max_interval:])
  onset_mat = (1 - rest_prob) * (
      onset_mat / onset_mat.sum(axis=1, keepdims=True))

  num_melody_events = 1 + 2 * num_pitches
  mat = np.zeros([num_melody_events, num_melody_events])

  # Continuing a rest is a non-event.
  mat[0, 0] = 1
  # After a rest, every note onset is equally likely.
  mat[0, 1:num_pitches + 1] = 1.0 / num_pitches

  # Onset rows and continuation (sustain) rows behave identically:
  # transitioning to rest or to a new onset follows the user-specified
  # distribution, sustaining the same pitch is a non-event, and switching
  # pitch without an onset is forbidden.
  for row_start in (1, num_pitches + 1):
    rows = slice(row_start, row_start + num_pitches)
    mat[rows, 0] = rest_prob
    mat[rows, 1:num_pitches + 1] = onset_mat
    mat[rows, num_pitches + 1:] = np.eye(num_pitches)

  return mat
def sequence_note_frames(sequence):
  """Summarize a NoteSequence as frames delimited by note onsets/offsets.

  Args:
    sequence: The NoteSequence for which to compute frame summaries.

  Returns:
    pitches: A list of MIDI pitches present in `sequence`, in ascending order.
    has_onsets: A Boolean matrix with shape `[num_frames, num_pitches]` where
        entry (i, j) indicates whether pitch j has a note onset in frame i.
    has_notes: A Boolean matrix with shape `[num_frames, num_pitches]` where
        entry (i, j) indicates whether pitch j is present in frame i, either
        as an onset or a sustained note.
    event_times: A list of length `num_frames - 1` containing the event times
        separating adjacent frames.
  """
  # Drums and unpitched programs cannot carry the melody.
  notes = [note for note in sequence.notes
           if not note.is_drum
           and note.program not in constants.UNPITCHED_PROGRAMS]

  # Frame boundaries are the distinct onset/offset times, excluding the
  # endpoints of the sequence itself.
  boundaries = ({note.start_time for note in notes} |
                {note.end_time for note in notes})
  boundaries.discard(0.0)
  boundaries.discard(sequence.total_time)
  event_times = sorted(boundaries)
  num_frames = len(event_times) + 1

  pitches = sorted({note.pitch for note in notes})
  pitch_map = {pitch: i for i, pitch in enumerate(pitches)}
  num_pitches = len(pitches)

  has_onsets = np.zeros([num_frames, num_pitches], dtype=bool)
  has_notes = np.zeros([num_frames, num_pitches], dtype=bool)
  for note in notes:
    col = pitch_map[note.pitch]
    # bisect_right/bisect_left place notes that start or end exactly on a
    # boundary into the correct adjacent frame.
    first_frame = bisect.bisect_right(event_times, note.start_time)
    last_frame = bisect.bisect_left(event_times, note.end_time)
    has_onsets[first_frame, col] = True
    has_notes[first_frame:last_frame + 1, col] = True

  return pitches, has_onsets, has_notes, event_times
def _melody_frame_log_likelihood(pitches, has_onsets, has_notes, durations,
instantaneous_non_max_pitch_prob,
instantaneous_non_empty_rest_prob,
instantaneous_missing_pitch_prob):
"""Compute the log-likelihood of each frame given each melody state."""
num_frames = len(has_onsets)
num_pitches = len(pitches)
# Whether or not each frame has any notes present at all.
any_notes = np.sum(has_notes, axis=1, dtype=bool)
# Whether or not each note has the maximum pitch in each frame.
if num_pitches > 1:
has_higher_notes = np.concatenate([
np.cumsum(has_notes[:, ::-1], axis=1, dtype=bool)[:, num_pitches-2::-1],
np.zeros([num_frames, 1], dtype=bool)
], axis=1)
else:
has_higher_notes = np.zeros([num_frames, 1], dtype=bool)
# Initialize the log-likelihood matrix. There are two melody states for each
# pitch (onset vs. non-onset) and one rest state.
mat = np.zeros([num_frames, 1 + 2 * num_pitches])
# Log-likelihood of each frame given rest. Depends only on presence of any
# notes.
mat[:, 0] = (
any_notes * instantaneous_non_empty_rest_prob +
~any_notes * (1 - instantaneous_non_empty_rest_prob))
# Log-likelihood of each frame given onset. Depends on presence of onset and
# whether or not it is the maximum pitch. Probability of no observed onset
# given melody onset is zero.
mat[:, 1:num_pitches+1] = has_onsets * (
~has_higher_notes * (1 - instantaneous_non_max_pitch_prob) +
has_higher_notes * instantaneous_non_max_pitch_prob)
# Log-likelihood of each frame given non-onset. Depends on absence of onset
# and whether note is present and the maximum pitch. Probability of observed
# onset given melody non-onset is zero; this is to prevent Viterbi from being
# "lazy" and always treating repeated notes as sustain.
mat[:, num_pitches+1:] = ~has_onsets * (
~has_higher_notes * (1 - instantaneous_non_max_pitch_prob) +
has_higher_notes * instantaneous_non_max_pitch_prob) * (
has_notes * (1 - instantaneous_missing_pitch_prob) +
~has_notes * instantaneous_missing_pitch_prob)
# Take the log and scale by duration.
mat = durations[:, np.newaxis] * np.log(mat)
return mat
def _melody_viterbi(pitches, melody_frame_loglik, melody_transition_loglik):
"""Use the Viterbi algorithm to infer a sequence of melody events."""
num_frames, num_melody_events = melody_frame_loglik.shape
assert num_melody_events == 2 * len(pitches) + 1
loglik_matrix = np.zeros([num_frames, num_melody_events])
path_matrix = np.zeros([num_frames, num_melody_events], dtype=np.int32)
# Assume the very first frame follows a rest.
loglik_matrix[0, :] = (
melody_transition_loglik[0, :] + melody_frame_loglik[0, :])
for frame in range(1, num_frames):
# At each frame, store the log-likelihood of the best sequence ending in
# each melody event, along with the index of the parent melody event from
# the previous frame.
mat = (np.tile(loglik_matrix[frame - 1][:, np.newaxis],
[1, num_melody_events]) +
melody_transition_loglik)
path_matrix[frame, :] = mat.argmax(axis=0)
loglik_matrix[frame, :] = (
mat[path_matrix[frame, :], range(num_melody_events)] +
melody_frame_loglik[frame])
# Reconstruct the most likely sequence of melody events.
path = [np.argmax(loglik_matrix[-1])]
for frame in range(num_frames, 1, -1):
path.append(path_matrix[frame - 1, path[-1]])
# Mapping from melody event index to rest or (pitch, is-onset) tuple.
def index_to_event(i):
if i == 0:
return REST
elif i <= len(pitches):
# Note onset.
return pitches[i - 1], True
else:
# Note sustain.
return pitches[i - len(pitches) - 1], False
return [index_to_event(index) for index in path[::-1]]
class MelodyInferenceError(Exception):  # pylint:disable=g-bad-exception-name
  """Raised when melody inference cannot be performed on a NoteSequence."""
def infer_melody_for_sequence(sequence,
                              melody_interval_scale=2.0,
                              rest_prob=0.1,
                              instantaneous_non_max_pitch_prob=1e-15,
                              instantaneous_non_empty_rest_prob=0.0,
                              instantaneous_missing_pitch_prob=1e-15):
  """Infer melody for a NoteSequence.

  This is a work in progress and should not necessarily be expected to return
  reasonable results. It operates under two main assumptions:

  1) Melody onsets always coincide with actual note onsets from the polyphonic
     NoteSequence.
  2) When multiple notes are active, the melody note tends to be the note with
     the highest pitch.

  Args:
    sequence: The NoteSequence for which to infer melody. This NoteSequence
        will be modified in place, with inferred melody notes added as a new
        instrument.
    melody_interval_scale: The scale parameter for the prior distribution over
        melody intervals.
    rest_prob: The probability of rest after a melody note.
    instantaneous_non_max_pitch_prob: The instantaneous probability that the
        melody note will not have the maximum active pitch.
    instantaneous_non_empty_rest_prob: The instantaneous probability that at
        least one note will be active during a melody rest.
    instantaneous_missing_pitch_prob: The instantaneous probability that the
        melody note will not be active.

  Returns:
    The instrument number used for the added melody.

  Raises:
    MelodyInferenceError: If `sequence` is quantized, or if the number of
        frames is too large.
  """
  if sequences_lib.is_quantized_sequence(sequence):
    raise MelodyInferenceError(
        'Melody inference on quantized NoteSequence not supported.')

  pitches, has_onsets, has_notes, event_times = sequence_note_frames(sequence)

  # Pick an instrument number one greater than any already in the sequence so
  # the inferred melody does not collide with existing notes.
  if sequence.notes:
    melody_instrument = max(note.instrument for note in sequence.notes) + 1
  else:
    melody_instrument = 0

  if melody_instrument == 9:
    # Avoid any confusion around drum channel.
    melody_instrument = 10

  if not pitches:
    # No pitches present in sequence; nothing to infer.
    return melody_instrument

  if len(event_times) + 1 > MAX_NUM_FRAMES:
    raise MelodyInferenceError(
        'Too many frames for melody inference: %d' % (len(event_times) + 1))

  # Compute frame durations (times between consecutive note events), including
  # the leading frame from time zero and the trailing frame to total_time.
  if event_times:
    durations = np.array(
        [event_times[0]] +
        [t2 - t1 for (t1, t2) in zip(event_times[:-1], event_times[1:])] +
        [sequence.total_time - event_times[-1]])
  else:
    durations = np.array([sequence.total_time])

  # Interval distribution is Cauchy-like.
  interval_prob_fn = lambda d: 1 / (1 + (d / melody_interval_scale) ** 2)
  melody_transition_distribution = _melody_transition_distribution(
      rest_prob=rest_prob, interval_prob_fn=interval_prob_fn)

  # Remove all pitches absent from sequence from transition matrix; for most
  # sequences this will greatly reduce the state space.
  num_midi_pitches = constants.MAX_MIDI_PITCH - constants.MIN_MIDI_PITCH + 1
  pitch_indices = (
      [0] +
      [p - constants.MIN_MIDI_PITCH + 1 for p in pitches] +
      [num_midi_pitches + p - constants.MIN_MIDI_PITCH + 1 for p in pitches]
  )
  melody_transition_loglik = np.log(
      melody_transition_distribution[pitch_indices, :][:, pitch_indices])

  # Compute log-likelihood of each frame under each possible melody event.
  melody_frame_loglik = _melody_frame_log_likelihood(
      pitches, has_onsets, has_notes, durations,
      instantaneous_non_max_pitch_prob=instantaneous_non_max_pitch_prob,
      instantaneous_non_empty_rest_prob=instantaneous_non_empty_rest_prob,
      instantaneous_missing_pitch_prob=instantaneous_missing_pitch_prob)

  # Compute the most likely sequence of melody events using Viterbi.
  melody_events = _melody_viterbi(
      pitches, melody_frame_loglik, melody_transition_loglik)

  def add_note(start_time, end_time, pitch):
    # Append one inferred melody note to `sequence` on the melody instrument.
    note = sequence.notes.add()
    note.start_time = start_time
    note.end_time = end_time
    note.pitch = pitch
    note.velocity = MELODY_VELOCITY
    note.instrument = melody_instrument

  # Convert the per-frame melody events into notes: track the currently
  # sounding melody pitch and emit a note each time it ends or changes.
  note_pitch = None
  note_start_time = None
  for event, time in zip(melody_events, [0.0] + event_times):
    if event == REST:
      if note_pitch is not None:
        # A note has just ended.
        add_note(note_start_time, time, note_pitch)
        note_pitch = None
    else:
      pitch, is_onset = event
      if is_onset:
        # This is a new note onset.
        if note_pitch is not None:
          add_note(note_start_time, time, note_pitch)
        note_pitch = pitch
        note_start_time = time
      else:
        # This is a continuation of the current note.
        assert pitch == note_pitch

  if note_pitch is not None:
    # Add the final note.
    add_note(note_start_time, sequence.total_time, note_pitch)

  return melody_instrument
Functions
def infer_melody_for_sequence(sequence, melody_interval_scale=2.0, rest_prob=0.1, instantaneous_non_max_pitch_prob=1e-15, instantaneous_non_empty_rest_prob=0.0, instantaneous_missing_pitch_prob=1e-15)
-
Infer melody for a NoteSequence.
This is a work in progress and should not necessarily be expected to return reasonable results. It operates under two main assumptions:
1) Melody onsets always coincide with actual note onsets from the polyphonic NoteSequence. 2) When multiple notes are active, the melody note tends to be the note with the highest pitch.
Args
sequence
- The NoteSequence for which to infer melody. This NoteSequence will be modified in place, with inferred melody notes added as a new instrument.
melody_interval_scale
- The scale parameter for the prior distribution over melody intervals.
rest_prob
- The probability of rest after a melody note.
instantaneous_non_max_pitch_prob
- The instantaneous probability that the melody note will not have the maximum active pitch.
instantaneous_non_empty_rest_prob
- The instantaneous probability that at least one note will be active during a melody rest.
instantaneous_missing_pitch_prob
- The instantaneous probability that the melody note will not be active.
Returns
The instrument number used for the added melody.
Raises
MelodyInferenceError
- If `sequence` is quantized, or if the number of frames is too large.
Expand source code
def infer_melody_for_sequence(sequence, melody_interval_scale=2.0, rest_prob=0.1, instantaneous_non_max_pitch_prob=1e-15, instantaneous_non_empty_rest_prob=0.0, instantaneous_missing_pitch_prob=1e-15): """Infer melody for a NoteSequence. This is a work in progress and should not necessarily be expected to return reasonable results. It operates under two main assumptions: 1) Melody onsets always coincide with actual note onsets from the polyphonic NoteSequence. 2) When multiple notes are active, the melody note tends to be the note with the highest pitch. Args: sequence: The NoteSequence for which to infer melody. This NoteSequence will be modified in place, with inferred melody notes added as a new instrument. melody_interval_scale: The scale parameter for the prior distribution over melody intervals. rest_prob: The probability of rest after a melody note. instantaneous_non_max_pitch_prob: The instantaneous probability that the melody note will not have the maximum active pitch. instantaneous_non_empty_rest_prob: The instantaneous probability that at least one note will be active during a melody rest. instantaneous_missing_pitch_prob: The instantaneous probability that the melody note will not be active. Returns: The instrument number used for the added melody. Raises: MelodyInferenceError: If `sequence` is quantized, or if the number of frames is too large. """ if sequences_lib.is_quantized_sequence(sequence): raise MelodyInferenceError( 'Melody inference on quantized NoteSequence not supported.') pitches, has_onsets, has_notes, event_times = sequence_note_frames(sequence) if sequence.notes: melody_instrument = max(note.instrument for note in sequence.notes) + 1 else: melody_instrument = 0 if melody_instrument == 9: # Avoid any confusion around drum channel. melody_instrument = 10 if not pitches: # No pitches present in sequence. 
return melody_instrument if len(event_times) + 1 > MAX_NUM_FRAMES: raise MelodyInferenceError( 'Too many frames for melody inference: %d' % (len(event_times) + 1)) # Compute frame durations (times between consecutive note events). if event_times: durations = np.array( [event_times[0]] + [t2 - t1 for (t1, t2) in zip(event_times[:-1], event_times[1:])] + [sequence.total_time - event_times[-1]]) else: durations = np.array([sequence.total_time]) # Interval distribution is Cauchy-like. interval_prob_fn = lambda d: 1 / (1 + (d / melody_interval_scale) ** 2) melody_transition_distribution = _melody_transition_distribution( rest_prob=rest_prob, interval_prob_fn=interval_prob_fn) # Remove all pitches absent from sequence from transition matrix; for most # sequences this will greatly reduce the state space. num_midi_pitches = constants.MAX_MIDI_PITCH - constants.MIN_MIDI_PITCH + 1 pitch_indices = ( [0] + [p - constants.MIN_MIDI_PITCH + 1 for p in pitches] + [num_midi_pitches + p - constants.MIN_MIDI_PITCH + 1 for p in pitches] ) melody_transition_loglik = np.log( melody_transition_distribution[pitch_indices, :][:, pitch_indices]) # Compute log-likelihood of each frame under each possibly melody event. melody_frame_loglik = _melody_frame_log_likelihood( pitches, has_onsets, has_notes, durations, instantaneous_non_max_pitch_prob=instantaneous_non_max_pitch_prob, instantaneous_non_empty_rest_prob=instantaneous_non_empty_rest_prob, instantaneous_missing_pitch_prob=instantaneous_missing_pitch_prob) # Compute the most likely sequence of melody events using Viterbi. 
melody_events = _melody_viterbi( pitches, melody_frame_loglik, melody_transition_loglik) def add_note(start_time, end_time, pitch): note = sequence.notes.add() note.start_time = start_time note.end_time = end_time note.pitch = pitch note.velocity = MELODY_VELOCITY note.instrument = melody_instrument note_pitch = None note_start_time = None for event, time in zip(melody_events, [0.0] + event_times): if event == REST: if note_pitch is not None: # A note has just ended. add_note(note_start_time, time, note_pitch) note_pitch = None else: pitch, is_onset = event if is_onset: # This is a new note onset. if note_pitch is not None: add_note(note_start_time, time, note_pitch) note_pitch = pitch note_start_time = time else: # This is a continuation of the current note. assert pitch == note_pitch if note_pitch is not None: # Add the final note. add_note(note_start_time, sequence.total_time, note_pitch) return melody_instrument
def sequence_note_frames(sequence)
-
Split a NoteSequence into frame summaries separated by onsets/offsets.
Args
sequence
- The NoteSequence for which to compute frame summaries.
Returns
pitches
- A list of MIDI pitches present in `sequence`, in ascending order.
has_onsets
- A Boolean matrix with shape
[num_frames, num_pitches]
where entry (i,j) indicates whether pitch j has a note onset in frame i. has_notes
- A Boolean matrix with shape
[num_frames, num_pitches]
where entry (i,j) indicates whether pitch j is present in frame i, either as an onset or a sustained note. event_times
- A list of length
num_frames - 1
containing the event times separating adjacent frames.
Expand source code
def sequence_note_frames(sequence): """Split a NoteSequence into frame summaries separated by onsets/offsets. Args: sequence: The NoteSequence for which to compute frame summaries. Returns: pitches: A list of MIDI pitches present in `sequence`, in ascending order. has_onsets: A Boolean matrix with shape `[num_frames, num_pitches]` where entry (i,j) indicates whether pitch j has a note onset in frame i. has_notes: A Boolean matrix with shape `[num_frames, num_pitches]` where entry (i,j) indicates whether pitch j is present in frame i, either as an onset or a sustained note. event_times: A list of length `num_frames - 1` containing the event times separating adjacent frames. """ notes = [note for note in sequence.notes if not note.is_drum and note.program not in constants.UNPITCHED_PROGRAMS] onset_times = [note.start_time for note in notes] offset_times = [note.end_time for note in notes] event_times = set(onset_times + offset_times) event_times.discard(0.0) event_times.discard(sequence.total_time) event_times = sorted(event_times) num_frames = len(event_times) + 1 pitches = sorted(set(note.pitch for note in notes)) pitch_map = dict((p, i) for i, p in enumerate(pitches)) num_pitches = len(pitches) has_onsets = np.zeros([num_frames, num_pitches], dtype=bool) has_notes = np.zeros([num_frames, num_pitches], dtype=bool) for note in notes: start_frame = bisect.bisect_right(event_times, note.start_time) end_frame = bisect.bisect_left(event_times, note.end_time) has_onsets[start_frame, pitch_map[note.pitch]] = True has_notes[start_frame:end_frame+1, pitch_map[note.pitch]] = True return pitches, has_onsets, has_notes, event_times
Classes
class MelodyInferenceError (*args, **kwargs)
-
Common base class for all non-exit exceptions.
Expand source code
class MelodyInferenceError(Exception): # pylint:disable=g-bad-exception-name pass
Ancestors
- builtins.Exception
- builtins.BaseException