360 lines
10 KiB
Python
360 lines
10 KiB
Python
from __future__ import annotations
|
|
|
|
import typing as t
|
|
from typing import Generic, final, overload
|
|
|
|
from narwhals.stable.v1.typing import IntoDataFrameT
|
|
|
|
from altair.datasets import _reader
|
|
from altair.datasets._reader import IntoFrameT
|
|
|
|
if t.TYPE_CHECKING:
|
|
import sys
|
|
from typing import Any, Literal
|
|
|
|
import pandas as pd
|
|
import polars as pl
|
|
import pyarrow as pa
|
|
|
|
from altair.datasets._cache import DatasetCache
|
|
from altair.datasets._reader import Reader
|
|
|
|
if sys.version_info >= (3, 11):
|
|
from typing import LiteralString, Self
|
|
else:
|
|
from typing_extensions import LiteralString, Self
|
|
from altair.datasets._reader import _Backend
|
|
from altair.datasets._typing import Dataset, Extension
|
|
|
|
|
|
# Public names of this module. ``load`` is listed here even though it is not
# assigned at import time: it is created on first attribute access by the
# module-level ``__getattr__`` defined at the bottom of this file.
__all__ = ["Loader", "load"]
|
|
|
|
|
|
class Loader(Generic[IntoDataFrameT, IntoFrameT]):
    """
    Load example datasets *remotely* from `vega-datasets`_, with caching.

    A ``Loader`` is not constructed directly; create one by naming a backend::

        from altair.datasets import Loader

        load = Loader.from_backend("polars")
        load
        Loader[polars]

    .. _vega-datasets:
        https://github.com/vega/vega-datasets
    """

    # Backend-specific reader; every public method delegates to it.
    _reader: Reader[IntoDataFrameT, IntoFrameT]

    @overload
    @classmethod
    def from_backend(
        cls, backend_name: Literal["polars"] = ..., /
    ) -> Loader[pl.DataFrame, pl.LazyFrame]: ...

    @overload
    @classmethod
    def from_backend(
        cls, backend_name: Literal["pandas", "pandas[pyarrow]"], /
    ) -> Loader[pd.DataFrame, pd.DataFrame]: ...

    @overload
    @classmethod
    def from_backend(
        cls, backend_name: Literal["pyarrow"], /
    ) -> Loader[pa.Table, pa.Table]: ...

    @classmethod
    def from_backend(
        cls: type[Loader[Any, Any]], backend_name: _Backend = "polars", /
    ) -> Loader[Any, Any]:
        """
        Create a new loader that returns data using the given backend.

        Parameters
        ----------
        backend_name
            DataFrame package/config used to return data.

            * *polars*: Using `polars defaults`_
            * *pandas*: Using `pandas defaults`_
            * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"``
            * *pyarrow*: (*Experimental*)

            .. warning::
                Most datasets use a `JSON format not supported`_ by ``pyarrow``

        Examples
        --------
        Using ``polars``::

            from altair.datasets import Loader

            load = Loader.from_backend("polars")
            cars = load("cars")

            type(cars)
            polars.dataframe.frame.DataFrame

        Using ``pandas``::

            load = Loader.from_backend("pandas")
            cars = load("cars")

            type(cars)
            pandas.core.frame.DataFrame

        Using ``pandas``, backed by ``pyarrow`` dtypes::

            load = Loader.from_backend("pandas[pyarrow]")
            co2 = load("co2")

            type(co2)
            pandas.core.frame.DataFrame

            co2.dtypes
            Date             datetime64[ns]
            CO2             double[pyarrow]
            adjusted CO2    double[pyarrow]
            dtype: object

        .. _polars defaults:
            https://docs.pola.rs/api/python/stable/reference/io.html
        .. _pandas defaults:
            https://pandas.pydata.org/docs/reference/io.html
        .. _JSON format not supported:
            https://arrow.apache.org/docs/python/json.html#reading-json-files
        """
        backend_reader = _reader._from_backend(backend_name)
        return cls.from_reader(backend_reader)

    @classmethod
    def from_reader(cls, reader: Reader[IntoDataFrameT, IntoFrameT], /) -> Self:
        """
        Build a loader around an already-constructed ``reader``.

        The instance is allocated with ``__new__`` directly, so ``__init__``
        is not invoked; the only state set here is ``_reader``.
        """
        loader = cls.__new__(cls)
        loader._reader = reader
        return loader

    def __call__(
        self,
        name: Dataset | LiteralString,
        suffix: Extension | None = None,
        /,
        **kwds: Any,
    ) -> IntoDataFrameT:
        """
        Fetch a remote dataset and return it as tabular data.

        Parameters
        ----------
        name
            Name of the dataset/`Path.stem`_.
        suffix
            File extension/`Path.suffix`_.

            .. note::
                Only needed if ``name`` is available in multiple formats.
        **kwds
            Arguments passed to the underlying read function.

        Examples
        --------
        Using ``polars``::

            from altair.datasets import Loader

            load = Loader.from_backend("polars")
            source = load("iowa_electricity")

            source.columns
            ['year', 'source', 'net_generation']

            source.head(5)
            shape: (5, 3)
            ┌────────────┬──────────────┬────────────────┐
            │ year       ┆ source       ┆ net_generation │
            │ ---        ┆ ---          ┆ ---            │
            │ date       ┆ str          ┆ i64            │
            ╞════════════╪══════════════╪════════════════╡
            │ 2001-01-01 ┆ Fossil Fuels ┆ 35361          │
            │ 2002-01-01 ┆ Fossil Fuels ┆ 35991          │
            │ 2003-01-01 ┆ Fossil Fuels ┆ 36234          │
            │ 2004-01-01 ┆ Fossil Fuels ┆ 36205          │
            │ 2005-01-01 ┆ Fossil Fuels ┆ 36883          │
            └────────────┴──────────────┴────────────────┘

        Using ``pandas``::

            load = Loader.from_backend("pandas")
            source = load("iowa_electricity")

            source.columns
            Index(['year', 'source', 'net_generation'], dtype='object')

            source.head(5)
                    year        source  net_generation
            0 2001-01-01  Fossil Fuels           35361
            1 2002-01-01  Fossil Fuels           35991
            2 2003-01-01  Fossil Fuels           36234
            3 2004-01-01  Fossil Fuels           36205
            4 2005-01-01  Fossil Fuels           36883

        Using ``pyarrow``::

            load = Loader.from_backend("pyarrow")
            source = load("iowa_electricity")

            source.column_names
            ['year', 'source', 'net_generation']

            source.slice(0, 5)
            pyarrow.Table
            year: date32[day]
            source: string
            net_generation: int64
            ----
            year: [[2001-01-01,2002-01-01,2003-01-01,2004-01-01,2005-01-01]]
            source: [["Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels"]]
            net_generation: [[35361,35991,36234,36205,36883]]

        .. _Path.stem:
            https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem
        .. _Path.suffix:
            https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix
        """
        reader = self._reader
        return reader.dataset(name, suffix, **kwds)

    def url(
        self,
        name: Dataset | LiteralString,
        suffix: Extension | None = None,
        /,
    ) -> str:
        """
        Return the address of a remote dataset.

        Parameters
        ----------
        name
            Name of the dataset/`Path.stem`_.
        suffix
            File extension/`Path.suffix`_.

            .. note::
                Only needed if ``name`` is available in multiple formats.

        .. _Path.stem:
            https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem
        .. _Path.suffix:
            https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix

        Examples
        --------
        The returned url will always point to an accessible dataset::

            import altair as alt
            from altair.datasets import Loader

            load = Loader.from_backend("polars")
            load.url("cars")
            "https://cdn.jsdelivr.net/npm/vega-datasets@v2.11.0/data/cars.json"

        We can pass the result directly to a chart::

            url = load.url("cars")
            alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q")
        """
        reader = self._reader
        return reader.url(name, suffix)

    @property
    def cache(self) -> DatasetCache:
        """
        Caching of remote dataset requests.

        Configure cache path::

            self.cache.path = "..."

        Download the latest datasets *ahead-of-time*::

            self.cache.download_all()

        Remove all downloaded datasets::

            self.cache.clear()

        Disable caching::

            self.cache.path = None
        """
        reader = self._reader
        return reader.cache

    def __repr__(self) -> str:
        """Show the concrete class name together with the reader's name."""
        class_name = type(self).__name__
        reader_name = self._reader._name
        return f"{class_name}[{reader_name}]"
|
|
|
|
|
|
@final
class _Load(Loader[IntoDataFrameT, IntoFrameT]):
    """
    ``Loader`` variant whose call accepts an optional per-call ``backend``.

    With ``backend=None`` the call is handled by ``Loader.__call__`` using
    this instance's own reader. Otherwise a loader is built for the named
    backend via ``from_backend`` and the call is forwarded to it.
    """

    @overload
    def __call__(  # pyright: ignore[reportOverlappingOverload]
        self,
        name: Dataset | LiteralString,
        suffix: Extension | None = ...,
        /,
        backend: None = ...,
        **kwds: Any,
    ) -> IntoDataFrameT: ...
    @overload
    def __call__(
        self,
        name: Dataset | LiteralString,
        suffix: Extension | None = ...,
        /,
        backend: Literal["polars"] = ...,
        **kwds: Any,
    ) -> pl.DataFrame: ...
    @overload
    def __call__(
        self,
        name: Dataset | LiteralString,
        suffix: Extension | None = ...,
        /,
        backend: Literal["pandas", "pandas[pyarrow]"] = ...,
        **kwds: Any,
    ) -> pd.DataFrame: ...
    @overload
    def __call__(
        self,
        name: Dataset | LiteralString,
        suffix: Extension | None = ...,
        /,
        backend: Literal["pyarrow"] = ...,
        **kwds: Any,
    ) -> pa.Table: ...
    def __call__(
        self,
        name: Dataset | LiteralString,
        suffix: Extension | None = None,
        /,
        backend: _Backend | None = None,
        **kwds: Any,
    ) -> IntoDataFrameT | pl.DataFrame | pd.DataFrame | pa.Table:
        """
        Load dataset ``name``, optionally overriding the backend for this call.

        Parameters
        ----------
        name
            Name of the dataset.
        suffix
            File extension, forwarded unchanged.
        backend
            Backend to use for this call only. ``None`` keeps the reader this
            instance was created with.
        **kwds
            Forwarded unchanged to the loader that handles the call.
        """
        if backend is not None:
            # Explicit override: delegate to a loader created for that backend.
            override = self.from_backend(backend)
            return override(name, suffix, **kwds)
        return super().__call__(name, suffix, **kwds)
|
|
|
|
|
|
# Annotation only: this statement does not bind ``load`` at import time.
# The instance is created and stored in the module globals by the
# module-level ``__getattr__`` below the first time ``load`` is accessed.
load: _Load[Any, Any]
|
|
|
|
|
|
def __getattr__(name):
    """
    Resolve module attributes that are not defined at import time.

    Only ``"load"`` is handled: a reader is obtained from
    ``_reader.infer_backend()``, wrapped in a ``_Load`` and stored in the
    module global ``load`` before being returned. Any other name raises
    ``AttributeError``.
    """
    global load

    if name != "load":
        msg = f"module {__name__!r} has no attribute {name!r}"
        raise AttributeError(msg)

    inferred = _reader.infer_backend()
    load = _Load.from_reader(inferred)
    return load
|