@@ -3520,70 +3520,138 @@ def join(
35203520 * ,
35213521 on : Optional [str ] = None ,
35223522 how : str = "left" ,
3523+ lsuffix : str = "" ,
3524+ rsuffix : str = "" ,
35233525 ) -> DataFrame :
35243526 if isinstance (other , bigframes .series .Series ):
35253527 other = other .to_frame ()
35263528
35273529 left , right = self , other
35283530
3529- if not left .columns .intersection (right .columns ).empty :
3530- raise NotImplementedError (
3531- f"Deduping column names is not implemented. { constants .FEEDBACK_LINK } "
3532- )
3531+ col_intersection = left .columns .intersection (right .columns )
3532+
3533+ if not col_intersection .empty :
3534+ if lsuffix == rsuffix == "" :
3535+ raise ValueError (
3536+ f"columns overlap but no suffix specified: { col_intersection } "
3537+ )
3538+
35333539 if how == "cross" :
35343540 if on is not None :
35353541 raise ValueError ("'on' is not supported for cross join." )
35363542 result_block = left ._block .merge (
35373543 right ._block ,
35383544 left_join_ids = [],
35393545 right_join_ids = [],
3540- suffixes = ("" , "" ),
3546+ suffixes = (lsuffix , rsuffix ),
35413547 how = "cross" ,
35423548 sort = True ,
35433549 )
35443550 return DataFrame (result_block )
35453551
35463552 # Join left columns with right index
35473553 if on is not None :
3554+ if left ._has_index and (on in left .index .names ):
3555+ if on in left .columns :
3556+ raise ValueError (
3557+ f"'{ on } ' is both an index level and a column label, which is ambiguous."
3558+ )
3559+ else :
3560+ raise NotImplementedError (
3561+ f"Joining on index level '{ on } ' is not yet supported. { constants .FEEDBACK_LINK } "
3562+ )
3563+ if (left .columns == on ).sum () > 1 :
3564+ raise ValueError (f"The column label '{ on } ' is not unique." )
3565+
35483566 if other ._block .index .nlevels != 1 :
35493567 raise ValueError (
35503568 "Join on columns must match the index level of the other DataFrame. Join on column with multi-index haven't been supported."
35513569 )
3552- # Switch left index with on column
3553- left_columns = left .columns
3554- left_idx_original_names = left .index .names if left ._has_index else ()
3555- left_idx_names_in_cols = [
3556- f"bigframes_left_idx_name_{ i } "
3557- for i in range (len (left_idx_original_names ))
3558- ]
3559- if left ._has_index :
3560- left .index .names = left_idx_names_in_cols
3561- left = left .reset_index (drop = False )
3562- left = left .set_index (on )
3563-
3564- # Join on index and switch back
3565- combined_df = left ._perform_join_by_index (right , how = how )
3566- combined_df .index .name = on
3567- combined_df = combined_df .reset_index (drop = False )
3568- combined_df = combined_df .set_index (left_idx_names_in_cols )
3569-
3570- # To be consistent with Pandas
3571- if combined_df ._has_index :
3572- combined_df .index .names = (
3573- left_idx_original_names
3574- if how in ("inner" , "left" )
3575- else ([None ] * len (combined_df .index .names ))
3576- )
35773570
3578- # Reorder columns
3579- combined_df = combined_df [list (left_columns ) + list (right .columns )]
3580- return combined_df
3571+ return self ._join_on_key (
3572+ other ,
3573+ on = on ,
3574+ how = how ,
3575+ lsuffix = lsuffix ,
3576+ rsuffix = rsuffix ,
3577+ should_duplicate_on_key = (on in col_intersection ),
3578+ )
35813579
35823580 # Join left index with right index
35833581 if left ._block .index .nlevels != right ._block .index .nlevels :
35843582 raise ValueError ("Index to join on must have the same number of levels." )
35853583
3586- return left ._perform_join_by_index (right , how = how )
3584+ return left ._perform_join_by_index (right , how = how )._add_join_suffix (
3585+ left .columns , right .columns , lsuffix = lsuffix , rsuffix = rsuffix
3586+ )
3587+
def _join_on_key(
    self,
    other: DataFrame,
    on: str,
    how: str,
    lsuffix: str,
    rsuffix: str,
    should_duplicate_on_key: bool,
) -> DataFrame:
    """Join the ``on`` column of this frame against the index of ``other``.

    Both sides have their column labels swapped for unique temporary names
    so the result can be reordered unambiguously, the ``on`` column is
    temporarily promoted to the left index, the join is delegated to
    ``_perform_join_by_index``, and finally the original labels (with
    suffixes where they overlap) are restored by ``_add_join_suffix``.

    Args:
        other (DataFrame):
            The right-hand frame; it is joined on its index.
        on (str):
            Label of the column in this frame to join on.
        how (str):
            Join type, forwarded to ``_perform_join_by_index``.
        lsuffix (str):
            Suffix for overlapping column labels coming from the left side.
        rsuffix (str):
            Suffix for overlapping column labels coming from the right side.
        should_duplicate_on_key (bool):
            Whether ``on`` is also a column label of ``other``. When true,
            a copy of the key column is used as the join key and emitted as
            the first column under its original label, while the original
            key column receives ``lsuffix`` like any other overlapping one.

    Returns:
        DataFrame:
            The joined frame with the final column labels applied.
    """
    # Column labels (and index names on the left) are reassigned below, so
    # work on copies of BOTH sides; otherwise the caller's ``other`` would
    # be left carrying the temporary ``bigframes_right_col_name_*`` labels.
    left, right = self.copy(), other.copy()

    # Replace all column names with unique names for reordering.
    left_col_original_names = left.columns
    on_col_name = "bigframes_left_col_on"
    dup_on_col_name = "bigframes_left_col_on_dup"
    left_col_temp_names = [
        f"bigframes_left_col_name_{i}" if col_name != on else on_col_name
        for i, col_name in enumerate(left_col_original_names)
    ]
    left.columns = pandas.Index(left_col_temp_names)

    # If the on column is also in the right frame, duplicate it and make the
    # duplicate the first column; the duplicate becomes the join key.
    if should_duplicate_on_key:
        left[dup_on_col_name] = left[on_col_name]
        on_col_name = dup_on_col_name
        left_col_temp_names = [on_col_name] + left_col_temp_names
        left = left[left_col_temp_names]

    # Switch left index with on column: park the existing index levels in
    # temporarily named columns so they can be restored after the join.
    left_idx_original_names = left.index.names if left._has_index else ()
    left_idx_names_in_cols = [
        f"bigframes_left_idx_name_{i}" for i in range(len(left_idx_original_names))
    ]
    if left._has_index:
        left.index.names = left_idx_names_in_cols
        left = left.reset_index(drop=False)
    left = left.set_index(on_col_name)

    right_col_original_names = right.columns
    right_col_temp_names = [
        f"bigframes_right_col_name_{i}"
        for i in range(len(right_col_original_names))
    ]
    right.columns = pandas.Index(right_col_temp_names)

    # Join on index and switch back.
    combined_df = left._perform_join_by_index(right, how=how)
    combined_df.index.name = on_col_name
    combined_df = combined_df.reset_index(drop=False)
    combined_df = combined_df.set_index(left_idx_names_in_cols)

    # To be consistent with Pandas: only inner/left joins keep the left
    # index names; other join types get unnamed index levels.
    if combined_df._has_index:
        combined_df.index.names = (
            left_idx_original_names
            if how in ("inner", "left")
            else ([None] * len(combined_df.index.names))
        )

    # Reorder columns: (duplicated key,) left columns, then right columns.
    combined_df = combined_df[left_col_temp_names + right_col_temp_names]
    return combined_df._add_join_suffix(
        left_col_original_names,
        right_col_original_names,
        lsuffix=lsuffix,
        rsuffix=rsuffix,
        extra_col=on if should_duplicate_on_key else None,
    )
35873655
35883656 def _perform_join_by_index (
35893657 self ,
@@ -3597,6 +3665,59 @@ def _perform_join_by_index(
35973665 )
35983666 return DataFrame (block )
35993667
3668+ def _add_join_suffix (
3669+ self ,
3670+ left_columns ,
3671+ right_columns ,
3672+ lsuffix : str = "" ,
3673+ rsuffix : str = "" ,
3674+ extra_col : typing .Optional [str ] = None ,
3675+ ):
3676+ """Applies suffixes to overlapping column names to mimic a pandas join.
3677+
3678+ This method identifies columns that are common to both a "left" and "right"
3679+ set of columns and renames them using the provided suffixes. Columns that
3680+ are not in the intersection are kept with their original names.
3681+
3682+ Args:
3683+ left_columns (pandas.Index):
3684+ The column labels from the left DataFrame.
3685+ right_columns (pandas.Index):
3686+ The column labels from the right DataFrame.
3687+ lsuffix (str):
3688+ The suffix to apply to overlapping column names from the left side.
3689+ rsuffix (str):
3690+ The suffix to apply to overlapping column names from the right side.
3691+ extra_col (typing.Optional[str]):
3692+ An optional column name to prepend to the final list of columns.
3693+ This argument is used specifically to match the behavior of a
3694+ pandas join. When a join key (i.e., the 'on' column) exists
3695+ in both the left and right DataFrames, pandas creates two versions
3696+ of that column: one copy keeps its original name and is placed as
3697+ the first column, while the other instances receive the normal
3698+ suffix. Passing the join key's name here replicates that behavior.
3699+
3700+ Returns:
3701+ DataFrame:
3702+ A new DataFrame with the columns renamed to resolve overlaps.
3703+ """
3704+ combined_df = self .copy ()
3705+ col_intersection = left_columns .intersection (right_columns )
3706+ final_col_names = [] if extra_col is None else [extra_col ]
3707+ for col_name in left_columns :
3708+ if col_name in col_intersection :
3709+ final_col_names .append (f"{ col_name } { lsuffix } " )
3710+ else :
3711+ final_col_names .append (col_name )
3712+
3713+ for col_name in right_columns :
3714+ if col_name in col_intersection :
3715+ final_col_names .append (f"{ col_name } { rsuffix } " )
3716+ else :
3717+ final_col_names .append (col_name )
3718+ combined_df .columns = pandas .Index (final_col_names )
3719+ return combined_df
3720+
36003721 @validations .requires_ordering ()
36013722 def rolling (
36023723 self ,
0 commit comments