I have a DataFrame with a MultiIndex created after some grouping:
import numpy as np
import pandas as pd
from numpy.random import randn
# Sum the random values per (A, B) pair; grouping on two keys is what
# gives the result its two-level (A, B) MultiIndex.
df = (
    pd.DataFrame(
        dict(
            A=['a1', 'a1', 'a2', 'a3'],
            B=['b1', 'b2', 'b3', 'b4'],
            Vals=randn(4),
        )
    )
    .groupby(by=['A', 'B'])
    .sum()
)
# Vals
# A B
# a1 b1 -1.632460
# b2 0.596027
# a2 b3 -0.619130
# a3 b4 -0.002009
How do I prepend a level to the MultiIndex so that I turn it into something like:
# Vals
# FirstLevel A B
# Foo a1 b1 -1.632460
# b2 0.596027
# a2 b3 -0.619130
# a3 b4 -0.002009
Answers:
Thank you for visiting the Q&A section on Magenaut. Please note that not every answer will necessarily solve your issue right away, so treat them as suggestions. If you found the post helpful (or not), leave a comment & I’ll get back to you as soon as possible.
Method 1
A nice way to do this in one line using pandas.concat():
import pandas as pd

# Wrapping df in a one-element list and labelling it with keys=['Foo'] adds
# 'Foo' as a new outermost index level; names=... names that new level.
pd.concat([df], keys=['Foo'], names=['Firstlevel'])
An even shorter way:
# Passing a mapping lets its key ('Foo') serve as the label of the new outer level.
pd.concat(dict(Foo=df), names=['Firstlevel'])
This can be generalized to many data frames, see the docs.
Method 2
You can first add it as a normal column and then append it to the current index, so:
# Step 1: add the new level's label as an ordinary column.
df['Firstlevel'] = 'Foo'
# Step 2: move that column into the index; append=True keeps the existing
# index levels instead of replacing them, and inplace=True mutates df itself.
df.set_index('Firstlevel', append=True, inplace=True)
And change the order if needed with:
# The level was appended to the existing ones, so spell out the wanted
# order to bring 'Firstlevel' to the front.
df.reorder_levels(order=['Firstlevel', 'A', 'B'])
Which results in:
Vals
Firstlevel A B
Foo a1 b1 0.871563
b2 0.494001
a2 b3 -0.167811
a3 b4 -1.353409
Method 3
I think this is a more general solution:
# Convert index to dataframe
old_idx = df.index.to_frame()

# Insert new level at specified location
# (`new_level_values` is a placeholder for a scalar or list-like you supply)
old_idx.insert(0, 'new_level_name', new_level_values)

# Convert back to MultiIndex
df.index = pd.MultiIndex.from_frame(old_idx)
Some advantages over the other answers:
- The new level can be added at any location, not just the top.
- It is purely a manipulation on the index and doesn’t require manipulating the data, like the concatenation trick.
- It doesn’t require adding a column as an intermediate step, which can break multi-level column indexes.
Method 4
I made a little function out of cxrodgers' answer, which IMHO is the best solution since it works purely on an index, independent of any data frame or series.
There is one fix I added: the to_frame() method will invent new names for index levels that don’t have one. As such the new index will have names that don’t exist in the old index. I added some code to revert this name-change.
Below is the code, I’ve used it myself for a while and it seems to work fine. If you find any issues or edge cases, I’d be much obliged to adjust my answer.
from typing import Any, Optional

import pandas as pd
def _handle_insert_loc(loc: int, n: int) -> int:
"""
Computes the insert index from the right if loc is negative for a given size of n.
"""
return n + loc + 1 if loc < 0 else loc
def add_index_level(
    old_index: pd.Index,
    value: Any,
    name: Optional[str] = None,
    loc: int = 0,
) -> pd.MultiIndex:
    """
    Expand a (multi)index by adding a level to it.

    :param old_index: The index to expand
    :param value: Scalar or list-like, the values of the new index level
    :param name: The name of the new index level
    :param loc: Where to insert the level in the index, 0 is at the front, negative values count back from the rear end
    :return: A new multi-index with the new level added
    """
    loc = _handle_insert_loc(loc, len(old_index.names))
    old_index_df = old_index.to_frame()
    old_index_df.insert(loc, name, value)
    # to_frame() invents column names for unnamed levels, so the level names
    # are rebuilt from the original index rather than read back from the frame.
    new_index_names = list(old_index.names)
    new_index_names.insert(loc, name)
    return pd.MultiIndex.from_frame(old_index_df, names=new_index_names)
It passed the following unittest code:
import unittest
import numpy as np
import pandas as pd
class TestPandaStuff(unittest.TestCase):
    """Unit tests for ``add_index_level``."""

    def test_add_index_level(self):
        """Levels can be prepended, inserted and appended without renaming unnamed levels."""
        df = pd.DataFrame(data=np.random.normal(size=(6, 3)))

        # Prepend an unnamed level holding a single scalar value.
        i1 = add_index_level(df.index, "foo")
        # it does not invent new index names where there are missing
        self.assertEqual([None, None], list(i1.names))
        # the new level values are added
        self.assertTrue(np.all(i1.get_level_values(0) == "foo"))
        self.assertTrue(np.all(i1.get_level_values(1) == df.index))

        # Insert a named level at an explicit position, then append one
        # using a negative location.
        i2 = add_index_level(i1, ["x", "y"] * 3, name="xy", loc=2)
        i3 = add_index_level(i2, ["a", "b", "c"] * 2, name="abc", loc=-1)
        # the unnamed levels stay unnamed, the new ones carry their names
        self.assertEqual([None, None, "xy", "abc"], list(i3.names))
        # the new level values are added
        self.assertTrue(np.all(i3.get_level_values(0) == "foo"))
        self.assertTrue(np.all(i3.get_level_values(1) == df.index))
        self.assertTrue(np.all(i3.get_level_values(2) == ["x", "y"] * 3))
        self.assertTrue(np.all(i3.get_level_values(3) == ["a", "b", "c"] * 2))
Method 5
Another answer using from_tuples(). This generalizes this previous answer.
key = "Foo"
name = "First"

# Multi-level index (df.index.nlevels > 1): every item is already a tuple,
# so the key is placed in front of its unpacked elements.
df.index = pd.MultiIndex.from_tuples(
    [(key, *levels) for levels in df.index],
    names=[name, *df.index.names],
)

# Single-level index (df.index.nlevels == 1): items are scalars, so pair
# each one with the key instead of unpacking it:
# df.index = pd.MultiIndex.from_tuples(((key, item) for item in df.index),
#                                      names=[name]+df.index.names)
I like this approach because
- it only modifies the index (no unnecessary copy action of the body)
- it works for both axes (row and column indices)
- it still can be written as a one-liner
Wrapping the above in a function makes it easier to switch between row and column indexes, and between single-level and multi-level indexes:
def prepend_index_level(index, key, name=None):
    """Return a MultiIndex equal to *index* with a constant outer level added.

    :param index: A flat ``pd.Index`` or a ``pd.MultiIndex`` (row or column axis)
    :param key: The value repeated for every entry of the new outermost level
    :param name: The name of the new level (``None`` leaves it unnamed)
    :return: A new ``pd.MultiIndex`` with one more level than *index*
    """
    names = [name, *index.names]
    if isinstance(index, pd.MultiIndex):
        # A MultiIndex always iterates as tuples, even when it has a single
        # level, so its items must not be wrapped a second time.
        rows = ((key, *item) for item in index)
    else:
        # A flat index iterates as scalars; pair each one with the key.
        rows = ((key, item) for item in index)
    return pd.MultiIndex.from_tuples(rows, names=names)
# Works on either axis: the rows gain a "First" level, the columns a "Top" level.
df.index = prepend_index_level(df.index, "Foo", "First")
df.columns = prepend_index_level(df.columns, "Bar", "Top")
# Top Bar
# Vals
# First A B
# Foo a1 b1 -0.446066
# b2 -0.248027
# a2 b3 0.522357
# a3 b4 0.404048
Finally, the above can be further generalized by inserting the key at any index level:
def insert_index_level(index, key, name=None, level=0):
    """Return a MultiIndex equal to *index* with a constant level inserted.

    :param index: A flat ``pd.Index`` or a ``pd.MultiIndex`` (row or column axis)
    :param key: The value repeated for every entry of the new level
    :param name: The name of the new level (``None`` leaves it unnamed)
    :param level: Position of the new level, with ``list.insert`` semantics
        (0 puts it in front of the existing levels)
    :return: A new ``pd.MultiIndex`` with one more level than *index*
    """
    def insert_(pos, seq, value):
        # Copy first so neither the index names nor the item tuples are mutated.
        seq = list(seq)
        seq.insert(pos, value)
        return tuple(seq)

    names = insert_(level, index.names, name)
    if not isinstance(index, pd.MultiIndex):
        # A flat index iterates as scalars; wrap them so every item is a tuple.
        # (A MultiIndex already yields tuples, even with a single level.)
        index = ((item,) for item in index)
    tuples_gen = (insert_(level, item, key) for item in index)
    return pd.MultiIndex.from_tuples(tuples_gen, names=names)
# Rows: place the "Last" level at position 2; columns: place "Top" in front.
df.index = insert_index_level(df.index, "Foo", "Last", level=2)
df.columns = insert_index_level(df.columns, "Bar", "Top", level=0)
# Top Bar
# Vals
# A B Last
# a1 b1 Foo -0.595949
# b2 Foo -1.621233
# a2 b3 Foo -0.748917
# a3 b4 Foo 2.147814
Method 6
How about building it from scratch with pandas.MultiIndex.from_tuples?
# Rebuild the index from scratch: put 'Foo' in front of every existing
# (A, B) pair and name the three resulting levels explicitly.
df.index = pd.MultiIndex.from_tuples(
    [('Foo', a, b) for a, b in df.index],
    names=['FirstLevel', 'A', 'B'],
)
Similar to cxrodgers' solution, this is a flexible method and avoids modifying the underlying array for the dataframe.
All methods were sourced from stackoverflow.com or stackexchange.com and are licensed under CC BY-SA 2.5, CC BY-SA 3.0 and CC BY-SA 4.0.