« home

Materials Project Formation Energy Distribution

MP has a curious bimodality in its formation energies. Considering the formation energies are the result of a carefully fitted correction scheme (see pymatgen.entries.compatibility.MaterialsProject2020Compatibility) that depends mainly on the composition of a compound, let's look at which elements dominate the upper and lower modes.

!pip install pymatviz
import pandas as pd
import plotly.express as px
import plotly.io as pio
from pymatgen.ext.matproj import MPRester

from pymatviz import count_elements, ptable_heatmap_plotly
from pymatviz.enums import Key


__author__ = "Janosh Riebesell"
__date__ = "2022-08-11"

pio.renderers.default = "png"

Check if all of MP has a bi-modal formation energy distribution. Short answer: yes it does.

MP_API_KEY = "your Materials Project API key"
e_form_col = "formation_energy_per_atom"
fields = [Key.mat_id, Key.formula_pretty, e_form_col, "energy_type"]

e_form_all_mp = MPRester(use_document_model=False).thermo.search(
    fields=fields,
    thermo_types=None,
    # thermo_types=["GGA_GGA+U_R2SCAN"],
    chunk_size=200,
)
df_e_form = pd.DataFrame(e_form_all_mp).set_index(Key.mat_id)
df_e_form = df_e_form.rename(columns={"formula_pretty": Key.formula})
df_e_form.head(10)
formula formation_energy_per_atom energy_type
material_id
mp-23155 Ar 0.000000 R2SCAN
mp-1094136 Ni 0.739852 GGA
mp-568714 Bi 1.117044 GGA
mp-1067758 Bi 0.036314 GGA
mp-10207 Np 0.447494 GGA
mp-1183258 AcTh3 0.132877 GGA
mp-11343 Th 0.151527 R2SCAN
mp-862690 Ac 0.000000 R2SCAN
mp-1181019 Mo 0.331963 GGA
mp-1014111 Ni 0.620239 GGA
# cache MP data
# %store df_e_form

# load cached MP data
# %store -r df_e_form
df_e_form.energy_type.value_counts()
GGA       80585
GGA+U     40970
R2SCAN    33062
Name: energy_type, dtype: int64
df_e_form_gga_only = df_e_form.query("energy_type in ('GGA', 'GGA+U')")
expected = 121_555
if len(df_e_form_gga_only) != expected:
    print(f"{expected=} GGA & GGA+U entries, got {len(df_e_form_gga_only)}")
for energy_type, df_e_type in [
    *df_e_form.groupby("energy_type"),
    ("GGA & GGA+U", df_e_form_gga_only),
]:
    fig = ptable_heatmap_plotly(df_e_type[Key.formula])
    title = f"{len(df_e_type):,} {energy_type} entries"
    fig.layout.title = dict(text=title, x=0.4, y=0.94, font=dict(size=20))
    fig.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
labels = {
    "formation_energy_per_atom": "Formation Energy [eV/atom]",
    "count": "Number of Entries",
}
for energy_type, df_e_type in [
    *df_e_form.groupby("energy_type"),
    ["all together", df_e_form],
    ["GGA & GGA+U (no r2SCAN)", df_e_form_gga_only],
]:
    fig = px.histogram(
        df_e_type.query(f"-5 < {e_form_col} < 5"),
        x=e_form_col,
        nbins=150,
        range_x=(-5, 3),
        labels=labels,
    )
    title = f"{energy_type}: {len(df_e_type):,} entries"
    fig.layout.title = dict(text=title, x=0.5)

    # add vertical line at valley of GGA & GGA+U bimodal distribution
    if energy_type == "GGA & GGA+U (no r2SCAN)":
        e_form_valley = -1.35
        anno = dict(
            text=f"{e_form_valley} eV/atom",
            font=dict(size=14, color="orange"),
            xshift=10,
        )
        fig.add_vline(
            e_form_valley, line=dict(color="orange", dash="dash"), annotation=anno
        )

    fig.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Plot element heatmap for GGA and GGA+U entries above and below e_form_valley.

for comparator, label in ((">", "higher"), ("<=", "lower")):
    df_query = df_e_form_gga_only.query(f"{e_form_col} {comparator} {e_form_valley}")
    fig = ptable_heatmap_plotly(df_query[Key.formula])
    title = (
        f"Elements in {len(df_query):,} {label} mode MP entries "
        f"({comparator} {e_form_valley} eV/atom)<br><br>"
    )
    title += "<br>".join(
        f"{e_type}: {cnt:,}"
        for e_type, cnt in df_query.energy_type.value_counts().items()
    )

    fig.layout.title = dict(text=title, x=0.4, y=0.94)
    fig.show()
No description has been provided for this image
No description has been provided for this image

Looks like the lower mode is mostly oxides, whereas the higher mode is more diverse also containing many nitrides, sulfides and selenides.

Another way to visualize this are bar charts.

for comparator, label in ((">", "higher"), ("<=", "lower")):
    sub_formulas = df_e_form_gga_only.query(
        f"formation_energy_per_atom {comparator} {e_form_valley}"
    ).formula
    elem_counts = count_elements(sub_formulas)
    fig = px.bar(elem_counts.nlargest(20))
    title = f"{label.title()} mode {comparator} {e_form_valley} eV/atom"
    fig.layout.update(title=dict(text=title, x=0.5), showlegend=False)
    fig.show()
No description has been provided for this image
No description has been provided for this image

This significant lowering of oxide formation energies compared to other anions might at least partially be an artifact of too little experimental data outside oxide systems. In other words, perhaps there should be stronger corrections applied to nitrides, selenides, etc. as well but because there's insufficient experimental data to fit a robust correction scheme there, MP doesn't.