« home

Matbench Perovskite Dataset

Exploratory Data Analysis (EDA). MPContribs link

# matminer needed for loading data
!pip install pymatviz matminer
import pandas as pd
import plotly.express as px
import plotly.io as pio

# see https://github.com/CompRhys/aviary/blob/main/aviary/wren/utils.py
from aviary.wren.utils import get_protostructure_label_from_spglib
from matminer.datasets import load_dataset
from tqdm import tqdm

import pymatviz as pmv
from pymatviz import ptable_heatmap_plotly, spacegroup_sunburst
from pymatviz.enums import Key


pmv.set_plotly_template("pymatviz_white")


__author__ = "Janosh Riebesell"
__date__ = "2022-03-19"


# Interactive plotly figures don't show up on GitHub.
# https://github.com/plotly/plotly.py/issues/931
# change renderer from "svg" to "notebook" to get hover tooltips back
# (but blank plots on GitHub!)
pio.renderers.default = "png"
df_perov = load_dataset("matbench_perovskites")

df_perov[[Key.spg_symbol, Key.spg_num]] = [
    struct.get_space_group_info() for struct in tqdm(df_perov[Key.structure])
]
df_perov[Key.volume] = df_perov[Key.structure].map(lambda struct: struct.volume)

df_perov[Key.formula] = df_perov[Key.structure].map(lambda cryst: cryst.formula)

df_perov[Key.crystal_system] = df_perov[Key.spg_num].map(
    pmv.utils.crystal_sys_from_spg_num
)
fig, axs = pmv.structure_2d(df_perov[Key.structure].iloc[:12])
No description has been provided for this image
labels = {"e_form": "Formation Energy (eV/atom)"}

fig = px.histogram(df_perov, x="e_form", nbins=300, labels=labels)

fig.layout.title = "Matbench Perovskites Formation Energy Distribution"

fig.add_vline(x=0, fillcolor="black", line=dict(width=2, dash="dot"))
No description has been provided for this image
fig = ptable_heatmap_plotly(
    df_perov[Key.formula], exclude_elements=["O"], heat_mode="percent"
)
fig.update_layout(
    title=dict(text="<b>Elements in Matbench Perovskites dataset</b>", x=0.36, y=0.9)
)
No description has been provided for this image
ax = df_perov[Key.crystal_system].value_counts().plot.bar(rot="horizontal")

ax.set(title="Crystal systems in Matbench Perovskites")

pmv.powerups.annotate_bars(ax, v_offset=250)
No description has been provided for this image
labels = {
    "spglib_spg_num": "Spacegroup",
    "e_form": "Formation Energy",
    "volume": "Volume",
}

fig = px.scatter(
    df_perov, x="volume", y="e_form", color="spglib_spg_num", labels=labels
)
fig.layout.title = dict(text="Matbench Perovskites Formation Energy vs. Volume", x=0.5)
No description has been provided for this image
fig = spacegroup_sunburst(df_perov.spglib_spg_num, show_counts="percent")
fig.layout.title = dict(text="Matbench Perovskites spacegroup sunburst", x=0.5)
No description has been provided for this image
# takes ~6h (when running uninterrupted)
for idx, struct in tqdm(df_perov[Key.structure].items(), total=len(df_perov)):
    if pd.isna(df_perov.aflow_wyckoff[idx]):
        df_perov.loc[idx, "aflow_wyckoff"] = get_protostructure_label_from_spglib(
            struct,
            "/path/to/aflow",  # defaults to aflow
        )
df_perov["aflow_wyckoff"] = pd.read_csv(
    # 2022-05-17-matbench_perovskites_aflow_labels.csv
    "https://docs.google.com/spreadsheets/d/"
    "1Mhk5t3Ac_aHOTWMjZ1DL4LtUBIB21nWt7oy2t3M-fQU/export?format=csv"
).aflow_wyckoff
# uncomment line to cache expensive aflow results
# %store df_perov

# uncomment line to reload cached dataframe
%store -r df_perov

extract spacegroups and crystal systems from Aflow Wyckoff labels

df_perov["aflow_spg_num"] = df_perov.aflow_wyckoff.str.split("_").str[2].astype(int)
df_perov["aflow_crys_sys"] = df_perov.aflow_spg_num.map(
    pmv.utils.crystal_sys_from_spg_num
)

Surprisingly large disparity between Spglib and Aflow spacegroups

Spglib is fast while Aflow uses a slower adaptive but presumably more correct algorithm.

fig = pmv.sankey_from_2_df_cols(df_perov, ["spglib_spg_num", "aflow_spg_num"])

title = "Spglib vs Aflow Spacegroups<br>for the Matbench Perovskites dataset"

fig.layout.title = dict(text=title, x=0.5)

pmv.io.save_and_compress_svg(fig, "sankey-spglib-vs-aflow-spacegroups")

fig.show()
sankey-spglib-vs-aflow-spacegroups.svg:
Done in 79 ms!
10.74 KiB - 37.1% = 6.752 KiB
No description has been provided for this image
fig = pmv.sankey_from_2_df_cols(df_perov, ["spglib_crys_sys", "aflow_crys_sys"])

title = "Spglib vs Aflow Crystal systems<br>for the Matbench Perovskites dataset"
fig.layout.title = dict(text=title, x=0.5)
No description has been provided for this image