Source code for ergo.platforms.metaculus.question.continuous

from collections import namedtuple
from datetime import datetime
import textwrap
from typing import Dict, List, Optional, Union

import jax.numpy as np
import numpy as onp
import pandas as pd
from plotnine import (
    aes,
    geom_density,
    ggplot,
    ggtitle,
    labs,
    scale_fill_brewer,
    scale_x_continuous,
)
import requests

from ergo import ppl
import ergo.distributions as dist
from ergo.scale import Scale
from ergo.theme import ergo_theme
from ergo.utils import memoized_method

from .constants import (
    max_loc,
    max_open_high,
    max_open_low,
    max_scale,
    min_open_high,
    min_open_low,
    min_scale,
)
from .question import MetaculusQuestion
from .types import ArrayLikes

# True-scale limits of a question's range. A field is None when the
# corresponding side of the range is open (see ContinuousQuestion.get_bounds).
Bounds = namedtuple("Bounds", ("floor", "ceiling"))


class ContinuousQuestion(MetaculusQuestion):
    """
    A continuous Metaculus question -- a question of the form, what's your
    distribution on this event?
    """

    def side_open(self, side) -> bool:
        """
        Report whether one side of the question's range is open.

        :param side: key looked up in ``self.possibilities``
            (the ``low_open``/``high_open`` properties pass "low" and "high")
        :return: True if ``possibilities[side]`` is "tail", False if it has any
            other value or the key is missing
        """
        try:
            return self.possibilities[side] == "tail"
        # The Metaculus API is inconsistent about how open sides are expressed.
        # Many questions express it explicitly via possibilities[side],
        # e.g. https://www.metaculus.com/questions/3992.
        # Other questions (I think only older ones)
        # do not seem to explicitly state whether the sides are open,
        # e.g. https://www.metaculus.com/questions/605.
        # My current guess (reflected in the code below)
        # is that both sides are always closed
        # on questions where possibilities[side] is missing.
        except KeyError:
            return False

    @property
    def low_open(self) -> bool:
        """
        Are you allowed to place probability mass below the bottom
        of this question's range?
        """
        return self.side_open("low")

    @property
    def high_open(self) -> bool:
        """
        Are you allowed to place probability mass
        above the top of this question's range?
        """
        return self.side_open("high")

    @property
    def p_above(self) -> Optional[float]:
        """
        One minus the "high" entry of the latest community percentiles,
        or None if there are no community percentiles yet.
        """
        percentiles = self.latest_community_percentiles
        if percentiles is None:
            return None
        return 1 - percentiles["high"]

    @property
    def p_below(self) -> Optional[float]:
        """
        The "low" entry of the latest community percentiles,
        or None if there are no community percentiles yet.
        """
        percentiles = self.latest_community_percentiles
        if percentiles is None:
            return None
        return percentiles["low"]

    @property
    def p_outside(self) -> Optional[float]:
        """
        How much probability mass is outside this question's range?

        :return: ``p_below + p_above``, or None if either is unavailable
        """
        below, above = self.p_below, self.p_above
        if below is None or above is None:
            return None
        return below + above

    @property
    def has_predictions(self):
        """
        Are there any predictions for the question yet?
        """
        return hasattr(self, "prediction_histogram")

    @property
    def question_range(self):
        """
        Range of answers specified when the question was created
        """
        return self.possibilities["scale"]

    @property
    def question_range_width(self):
        """
        Difference between the "max" and "min" entries of ``question_range``.
        """
        return self.question_range["max"] - self.question_range["min"]

    def _scale_x(self, xmin: Optional[float] = None, xmax: Optional[float] = None):
        """
        Build the plotnine x-axis scale restricted to ``(xmin, xmax)``.
        """
        return scale_x_continuous(limits=(xmin, xmax))

    @property
    def plot_title(self):
        """
        The question name (or, if that is falsy, the "title" from the question
        data) wrapped to lines of at most 60 characters.
        """
        return "\n".join(textwrap.wrap(self.name or self.data["title"], 60))  # type: ignore

    @property
    def latest_community_percentiles(self):
        """
        :return: Some percentiles for the metaculus community's latest rough
            prediction, or None if the prediction timeseries is empty.
            `prediction_histogram` returns a more fine-grained histogram of the
            community prediction
        """
        if len(self.prediction_timeseries) == 0:
            return None
        return self.prediction_timeseries[-1]["community_prediction"]

    def prepare_logistic(self, normalized_dist: dist.Logistic) -> dist.Logistic:
        """
        Transform a single logistic distribution by clipping the parameters
        and adding scale information as needed for submission to Metaculus.
        The loc and scale have to be within a certain range for the Metaculus API
        to accept the prediction.

        :param normalized_dist: a (normalized) logistic distribution. If it has a
            ``base_dist`` attribute, that underlying distribution is used instead.
        :return: a transformed logistic distribution
        :raises ValueError: if the distribution's scale ``s`` is not positive
        """
        if hasattr(normalized_dist, "base_dist"):
            normalized_dist = normalized_dist.base_dist  # type: ignore

        if normalized_dist.s <= 0:
            raise ValueError("logistic_params.scale must be greater than 0")

        # Note: loc is only capped from above (max_loc); scale is clipped on
        # both sides.
        clipped_loc = min(normalized_dist.loc, max_loc)
        clipped_scale = float(
            onp.clip(normalized_dist.s, min_scale, max_scale)  # type: ignore
        )

        if self.low_open:
            low = float(
                onp.clip(normalized_dist.cdf(0), min_open_low, max_open_low)
            )
        else:
            low = 0

        if self.high_open:
            # The lower clip bound is offset by `low`, so `high` is always at
            # least min_open_high above it (unless max_open_high caps it).
            high = float(
                onp.clip(normalized_dist.cdf(1), min_open_high + low, max_open_high)
            )
        else:
            high = 1

        return dist.Logistic(
            clipped_loc, clipped_scale, Scale(0, 1), {"low": low, "high": high}
        )

    def prepare_logistic_mixture(
        self, normalized_dist: dist.LogisticMixture
    ) -> dist.LogisticMixture:
        """
        Transform a (normalized) logistic mixture distribution as needed for
        submission to Metaculus.

        Each component goes through :meth:`prepare_logistic` and the component
        weights are clipped to the interval [0.01, 0.99].

        :param normalized_dist: normalized mixture dist
        :return: normalized dist clipped and formatted for the API
        """
        transformed_components = [
            self.prepare_logistic(component)
            for component in normalized_dist.components
        ]
        transformed_probs = onp.clip(normalized_dist.probs, 0.01, 0.99)  # type: ignore
        return dist.LogisticMixture(transformed_components, transformed_probs)  # type: ignore

    def community_pairs(self, normalized=False, denorm_xs_only=False):
        """
        Convert the community prediction histogram to a list of
        ``{"x": ..., "density": ...}`` dicts.

        Element 0 of each histogram entry is used as x and element 2 as density.

        :param normalized: if True, return x and density as stored in the
            histogram (takes precedence over ``denorm_xs_only``)
        :param denorm_xs_only: if True (and ``normalized`` is False), denormalize
            only the x values and leave the densities as stored
        :return: list of dicts; by default both x and density are denormalized
            with ``self.scale``
        """
        if normalized:
            return [
                {"x": float(v[0]), "density": v[2]} for v in self.prediction_histogram
            ]
        if denorm_xs_only:
            return [
                {"x": self.scale.denormalize_point(float(v[0])), "density": v[2]}
                for v in self.prediction_histogram
            ]
        return [
            {
                "x": self.scale.denormalize_point(float(v[0])),
                "density": self.scale.denormalize_density(float(v[2])),
            }
            for v in self.prediction_histogram
        ]

    def community_dist(self) -> dist.PointDensity:
        """
        Get the community distribution for this question
        NB: currently missing the part of the distribution outside
        the question range

        :return: the (true-scale) community distribution as a histogram.
        """
        # TODO (#306): Unify distributions interface
        # TODO (#307): Account for values out of range in
        # ContinuousQuestion.community_dist()
        histogram = self.community_pairs(normalized=True)
        return dist.PointDensity.from_pairs(histogram, self.scale, normalized=True)

    def community_conditions(self, crossentropy_weight=0.1, interval_weight=10000.0):
        """
        Build conditions describing the current community prediction.

        :param crossentropy_weight: weight given to the cross-entropy condition
            on the in-range community density
        :param interval_weight: weight given to the interval conditions that
            are added for each open side of the range
        :return: list with one CrossEntropyCondition, followed by an
            IntervalCondition for the low side (if open) and one for the high
            side (if open)
        """
        # Imported here rather than at module level, as in the original code
        # (presumably to avoid a circular import -- TODO confirm).
        from ergo.conditions import (
            CrossEntropyCondition,
            IntervalCondition,
            Condition,
        )

        pairs = self.community_pairs(normalized=True)

        # Note that this histogram is normalized - it sums to 1 even if the
        # pairs don't!
        point_density_dist = dist.PointDensity.from_pairs(
            pairs, scale=self.scale, normalized=True
        )
        condition = CrossEntropyCondition(
            point_density_dist, weight=crossentropy_weight
        )
        community_conditions: List[Condition] = [condition]

        if self.low_open:
            community_conditions.append(
                IntervalCondition(
                    p=self.p_below, max=self.scale.low, weight=interval_weight
                )
            )
        if self.high_open:
            community_conditions.append(
                IntervalCondition(
                    p=self.p_above, min=self.scale.high, weight=interval_weight
                )
            )
        return community_conditions

    @memoized_method(None)
    def community_dist_in_range(self):
        """
        A distribution for the portion of the current normalized community
        prediction that's within the question's range, i.e.
        0...(len(self.prediction_histogram)-1).

        :return: distribution on integers
        """
        y2 = [p[2] for p in self.prediction_histogram]
        return dist.Categorical(np.array(y2))

    def sample_normalized_community(self) -> float:
        """
        Sample an approximation of the entire current community prediction,
        on the normalized scale. The main reason that it's just an approximation
        is that we don't know exactly where probability mass outside of the
        question range should be, so we place it arbitrarily.

        :return: One sample on the normalized scale
        """
        # FIXME: Samples below/above range are pretty arbitrary
        # The three draws are kept in this order so the sequence of random
        # samples is unchanged.
        sample_below_range = -dist.halfnormal(0.1)
        sample_above_range = 1 + dist.halfnormal(0.1)
        sample_in_range = ppl.sample(self.community_dist_in_range()) / float(
            len(self.prediction_histogram)
        )

        p_below = self.latest_community_percentiles["low"]
        p_above = 1 - self.latest_community_percentiles["high"]
        p_in_range = 1 - p_below - p_above

        return float(
            dist.random_choice(
                [sample_below_range, sample_in_range, sample_above_range],
                ps=[p_below, p_in_range, p_above],
            )
        )

    def sample_community(self) -> float:
        """
        Sample an approximation of the entire current community prediction,
        on the true scale of the question.
        The main reason that it's just an approximation is that we don't know
        exactly where probability mass outside of the question range should be,
        so we place it arbitrarily

        :return: One sample on the true scale
        :raises ValueError: if the question has no predictions yet
        """
        if not self.has_predictions:
            raise ValueError("There are currently no predictions for this question")

        normalized_sample = self.sample_normalized_community()
        sample = np.array(self.scale.denormalize_points([normalized_sample]))
        if self.name:
            ppl.tag(sample, self.name)
        return float(sample)

    def get_submission_from_samples(
        self, samples: Union[pd.Series, np.ndarray], verbose=False
    ) -> dist.LogisticMixture:
        """
        Fit a three-component logistic mixture to samples and prepare it for
        submission.

        :param samples: samples from a distribution answering the prediction
            question (they are normalized with ``self.scale`` before fitting)
        :param verbose: passed through to ``LogisticMixture.from_samples``
        :return: the fitted mixture after :meth:`prepare_logistic_mixture`
        :raises TypeError: if the exact type of ``samples`` is not in ArrayLikes
        """
        if type(samples) not in ArrayLikes:
            raise TypeError("Please submit a vector of samples")

        normalized_samples = self.scale.normalize_points(samples)
        _dist = dist.LogisticMixture.from_samples(
            normalized_samples,
            fixed_params={"num_components": 3},
            verbose=verbose,
            scale=Scale(0, 1),
        )
        return self.prepare_logistic_mixture(_dist)

    @staticmethod
    def format_logistic_for_api(submission: dist.Logistic, weight: float) -> dict:
        """
        Convert one logistic component into the dict sent to the Metaculus API.

        :param submission: logistic component; must have a scale and carry
            "low" and "high" entries in its metadata
        :param weight: mixture weight of this component
        :return: dict with keys "kind", "x0", "s", "w", "low" and "high"
        :raises ValueError: if ``submission.scale`` is None
        """
        if submission.scale is None:
            raise ValueError("Submission distribution needs a scale")

        # Convert all the numbers to floats here so that you can be sure that
        # wherever they originated (e.g. numpy), they'll be regular old floats
        # that can be converted to json by json.dumps.
        return {
            "kind": "logistic",
            "x0": float(submission.loc),
            "s": float(submission.s),
            "w": float(weight),
            "low": float(submission.metadata["low"]),
            "high": float(submission.metadata["high"]),
        }

    def submit(self, submission: dist.LogisticMixture) -> requests.Response:
        """
        Submit a prepared logistic mixture as the prediction for this question,
        then refresh the question.

        :param submission: mixture whose components have been prepared for the
            API (see :meth:`prepare_logistic_mixture`)
        :return: the response from ``self.metaculus.predict``
        """
        formatted_components = [
            self.format_logistic_for_api(component, weight)
            for component, weight in zip(submission.components, submission.probs)
        ]
        prediction_data = {
            "prediction": {
                "kind": "multi",
                # Components are sent in order of decreasing weight.
                "d": sorted(
                    formatted_components, key=lambda component: -component["w"]
                ),
            },
            "void": False,
        }

        r = self.metaculus.predict(self.id, prediction_data)
        self.refresh_question()
        return r

    def submit_from_samples(self, samples, verbose=False) -> requests.Response:
        """
        Submit prediction to Metaculus based on samples from a prediction
        distribution

        :param samples: Samples from a distribution answering the prediction
            question
        :param verbose: passed through to :meth:`get_submission_from_samples`
        :return: the response returned by :meth:`submit`
        """
        submission = self.get_submission_from_samples(samples, verbose=verbose)
        return self.submit(submission)

    def get_bounds(self):
        """
        Return true-scale bounds.

        :return: Bounds whose ``floor``/``ceiling`` is the low/high end of
            ``self.scale`` when that side is closed, and None when it is open
        """
        floor = None if self.low_open else float(self.scale.low)
        ceiling = None if self.high_open else float(self.scale.high)
        return Bounds(floor=floor, ceiling=ceiling)

    def get_logistic_from_json(self, logistic_json: Dict) -> dist.Logistic:
        """
        Rebuild a logistic component from its API representation.

        :param logistic_json: dict with (at least) the keys "x0" and "s"
        :return: the logistic wrapped in ``dist.Truncate``, truncated at the
            normalized floor/ceiling for each closed side of the range
        """
        bounds = self.get_bounds()

        normed_bounds = {}
        if bounds.floor is not None:
            normed_bounds["floor"] = self.scale.normalize_point(bounds.floor)
        if bounds.ceiling is not None:
            normed_bounds["ceiling"] = self.scale.normalize_point(bounds.ceiling)

        return dist.Truncate(  # type: ignore
            dist.Logistic(logistic_json["x0"], logistic_json["s"], normalized=True),
            **normed_bounds,
        )

    def get_submission_from_json(self, submission_json: Dict) -> dist.LogisticMixture:
        """
        Rebuild a logistic mixture from its API representation.

        :param submission_json: iterable of component dicts, each with keys
            "x0", "s" and "w"
        :return: mixture of the rebuilt components weighted by their "w" values
        """
        components = [
            self.get_logistic_from_json(logistic_json)
            for logistic_json in submission_json
        ]
        probs = [logistic_json["w"] for logistic_json in submission_json]
        return dist.LogisticMixture(components, probs)

    def get_latest_normalized_prediction(self) -> dist.LogisticMixture:
        """
        Rebuild the mixture from the "d" entry of the last item in
        ``self.my_predictions["predictions"]``.
        """
        latest_prediction = self.my_predictions["predictions"][-1]["d"]
        return self.get_submission_from_json(latest_prediction)

    @staticmethod
    def _draw_plot(plot, method_name: str) -> None:
        """
        Draw a plot; on RuntimeError print the error and a bandwidth hint.

        :param plot: plot object exposing ``draw()``
        :param method_name: name of the public method to mention in the hint
        """
        try:
            plot.draw()  # type: ignore
        except RuntimeError as err:
            print(err)
            print(
                "The plot was unable to automatically determine a bandwidth. "
                "You can manually specify one with the keyword 'bw', e.g., "
                f"{method_name}(..., bw=.1)"
            )

    def show_prediction(
        self,
        samples,
        plot_samples: bool = True,
        plot_fitted: bool = False,
        percent_kept: float = 0.95,
        side_cut_from: str = "both",
        show_community: bool = False,
        num_samples: int = 1000,
        **kwargs,
    ):
        """
        Plot prediction on the true question scale from samples or a submission
        object. Optionally compare prediction against a sample from the
        distribution of community predictions

        :param samples: samples from a distribution answering the prediction
            question (true scale). Can either be a 1-d array corresponding to one
            model's predictions, or a pandas DataFrame with each column
            corresponding to a distinct model's predictions
        :param plot_samples: boolean indicating whether to plot the raw samples
        :param plot_fitted: boolean indicating whether to compute Logistic
            Mixture Params from samples and plot the resulting fitted
            distribution. Note this is currently only supported for 1-d samples
        :param percent_kept: percentage of sample distribution to keep
        :param side_cut_from: which side to cut tails from,
            either 'both','lower', or 'upper'
        :param show_community: boolean indicating whether comparison to
            community predictions should be made
        :param num_samples: number of samples from the community (and from the
            fitted distribution); replaced by the number of provided samples
            when ``plot_samples`` is True
        :param kwargs: additional plotting parameters
        :raises ValueError: if neither ``plot_fitted`` nor ``plot_samples`` is
            True, if the samples have an unsupported type, or if ``plot_fitted``
            is requested for a DataFrame with more than one column
        """
        if not plot_fitted and not plot_samples:
            raise ValueError(
                "Nothing to plot. Niether plot_fitted nor plot_samples was True"
            )

        df = pd.DataFrame()

        # Convert lists up front so that the plot_fitted-only path accepts
        # them too (previously this only happened when plot_samples was True).
        if isinstance(samples, list):
            samples = pd.Series(samples)

        if plot_samples:
            if type(samples) not in ArrayLikes:
                raise ValueError(
                    "Samples should be a list, numpy array or pandas series"
                )
            num_samples = samples.shape[0]

            if isinstance(samples, pd.DataFrame):
                if plot_fitted and samples.shape[1] > 1:
                    raise ValueError(
                        "For multiple predictions comparisons, only samples can be compared (plot_fitted must be False)"
                    )
                for col in samples:
                    # use numpy array to ensure df doesn't become read-only
                    df[col] = onp.array(self.scale.normalize_points(samples[col]))
            else:
                # use numpy array to ensure df doesn't become read-only
                df["samples"] = onp.array(self.scale.normalize_points(samples))

        if plot_fitted:
            prediction = self.get_submission_from_samples(samples)
            df["fitted"] = pd.Series(
                [prediction.sample() for _ in range(0, num_samples)]
            )

        if show_community:
            df["community"] = [  # type: ignore
                self.sample_normalized_community() for _ in range(0, num_samples)
            ]

        # get domain for graph given the percentage of distribution kept
        xmin, xmax = self.scale.denormalize_points(
            self.get_central_quantiles(
                df, percent_kept=percent_kept, side_cut_from=side_cut_from,
            )
        )

        # Everything in df is on the normalized scale up to here.
        for col in df:
            df[col] = self.scale.denormalize_points(df[col])

        df = pd.melt(df, var_name="sources", value_name="samples")  # type: ignore

        title = self.plot_title
        if show_community:
            title = title + "\n\nPrediction vs Community"

        plot = self.comparison_plot(df, xmin, xmax, **kwargs) + labs(
            x="Prediction", y="Density", title=title,
        )
        self._draw_plot(plot, "show_prediction")

    def show_community_prediction(
        self,
        percent_kept: float = 0.95,
        side_cut_from: str = "both",
        num_samples: int = 1000,
        **kwargs,
    ):
        """
        Plot samples from the community prediction on this question

        :param percent_kept: percentage of sample distribution to keep
        :param side_cut_from: which side to cut tails from,
            either 'both','lower', or 'upper'
        :param num_samples: number of samples from the community
        :param kwargs: additional plotting parameters, passed to
            :meth:`density_plot` (e.g. ``bw`` or ``fill``)
        """
        community_samples = pd.Series(
            [self.sample_normalized_community() for _ in range(0, num_samples)]
        )

        _xmin, _xmax = self.scale.denormalize_points(
            self.get_central_quantiles(
                community_samples,
                percent_kept=percent_kept,
                side_cut_from=side_cut_from,
            )
        )

        df = pd.DataFrame(
            data={"samples": self.scale.denormalize_points(community_samples)}
        )

        plot = self.density_plot(df, _xmin, _xmax, **kwargs) + labs(
            x="Prediction",
            y="Density",
            title=self.plot_title + "\n\nCommunity Predictions",
        )
        # The hint names this method (it used to point at show_prediction).
        self._draw_plot(plot, "show_community_prediction")

    def comparison_plot(
        self, df: pd.DataFrame, xmin=None, xmax=None, bw="normal_reference", **kwargs
    ):
        """
        Overlaid density plot with one filled density per source.

        :param df: long-format frame; column 0 is used for the fill grouping and
            column 1 for the x values
        :param xmin: lower x-axis limit
        :param xmax: upper x-axis limit
        :param bw: bandwidth passed to ``geom_density``
        :param kwargs: accepted for call compatibility; not used here
        :return: the plot object
        """
        return (
            ggplot(df, aes(df.columns[1], fill=df.columns[0]))
            + scale_fill_brewer(type="qual", palette="Pastel1")
            + geom_density(bw=bw, alpha=0.8)
            + ggtitle(self.plot_title)
            + self._scale_x(xmin, xmax)
            + ergo_theme
        )

    def density_plot(
        self,
        df: pd.DataFrame,
        xmin=None,
        xmax=None,
        fill: str = "#fbb4ae",
        bw="normal_reference",
        **kwargs,
    ):
        """
        Density plot of a single column of samples.

        :param df: frame whose column 0 holds the x values
        :param xmin: lower x-axis limit
        :param xmax: upper x-axis limit
        :param fill: fill colour of the density
        :param bw: bandwidth passed to ``geom_density``
        :param kwargs: accepted for call compatibility; not used here
        :return: the plot object
        """
        return (
            ggplot(df, aes(df.columns[0]))
            # bw was previously accepted but never forwarded, so a
            # user-supplied bandwidth had no effect.
            + geom_density(bw=bw, fill=fill, alpha=0.8)
            + ggtitle(self.plot_title)
            + self._scale_x(xmin, xmax)
            + ergo_theme
        )

    def change_since(self, since: datetime):
        """
        Calculate change in community prediction median between the argument
        and most recent prediction

        :param since: datetime
        :return: change in median community prediction since datetime, or 0 if
            looking up either community prediction raises LookupError
        """
        try:
            old = self.get_community_prediction(before=since)
            new = self.get_community_prediction()
        except LookupError:
            return 0

        return new["q2"] - old["q2"]

    def normalize_samples(self, samples):
        """
        Map samples from their true scale to the Metaculus normalized scale

        :param samples: samples from a distribution answering the prediction
            question (true scale)
        :return: samples on the normalized scale
        """
        return self.scale.normalize_points(samples)

    def denormalize_samples(self, samples):
        """
        Map samples from the Metaculus normalized scale to the true scale

        :param samples: samples on the normalized scale
        :return: samples from a distribution answering the prediction question
            (true scale)
        """
        return self.scale.denormalize_points(samples)