
    g%Y                        d dl Z d dlZ	 d dlZd dlZd dlZd dlm	Z	m
Z
 d dlmZ d dlmZ 	 d dlmZ d dlmZmZmZ 	 d dlZd dlmZ d dlmZmZ ej:                  j                   Zej:                  j,                  d        Zej:                  j,                  d        Z ej:                  j,                  d	        Z!ej:                  j,                  d
        Z"ej:                  j,                  d        Z#ej:                  j,                  d        Z$ej:                  j,                  d        Z%ej:                  j,                  d        Z&ej:                  j,                  d        Z'ej:                  j,                  d        Z(ej:                  j,                  ej:                  jS                  d      d               Z*ej:                  j,                  d        Z+ej:                  j,                  d        Z,ej:                  j,                  d        Z-ej:                  j,                  d        Z.ej:                  j,                  d        Z/ej:                  j,                  d        Z0ej:                  j,                  d        Z1ej:                  j,                  d        Z2ej:                  j,                  d        Z3ej:                  j,                  d        Z4ej:                  j,                  d        Z5ej:                  j,                  d        Z6ej:                  j,                  d        Z7ej:                  j,                  ej:                  jq                  d g d!      ej:                  jq                  d"d#d$g      d%                      Z9ej:                  j,                  d&        Z:ej:                  j,                  d'        Z;y# e$ r dZY w xY w# e$ r dZY w xY w# e$ r dxZZY w xY w)(    N)LocalFileSystemSubTreeFileSystem)guid)Version)_read_table_test_dataframe_write_table)_roundtrip_pandas_dataframealltypes_samplec                 j   t        d      }| dz  }t        j                  j                  |      }d|j                  j
                  v sJ t        ||       t        j                  |      j
                  }d|v sJ t        j                  |d   j                  d            }|d   dd ddd	d
gk(  sJ y )N'  sizepandas_roundtrip.parquets   pandasutf8index_columnsranger      )kindnamestartstopstep)r   paTablefrom_pandasschemametadatar	   pqread_metadatajsonloadsdecode)tempdirdffilenamearrow_tabler   jss         V/var/www/openai/venv/lib/python3.12/site-packages/pyarrow/tests/parquet/test_pandas.py#test_pandas_parquet_custom_metadatar*   7   s    	e	$B33H((&&r*K**33333h')22H   	HY'..v6	7BoG,0-.,-$/ #0 0 0 0    c           	         t        j                  t        j                  dt        j                               t        j                  dt        j                               t        j                  dt        j
                               g      }t        j                  t        j                  dt        j                        t        j                  dt        j                        g dd      }t        j                  dd	gd
d gd d gd      }t         j                  j                  ||d      }t         j                  j                  ||d      }|j                  j                  |j                  d      rJ |j                  j                  |j                        sJ t        j                  | dz  |      }|j!                  |       |j!                  |       y )Nintfloatstring   dtype)ABBAEDDAACDC)r-   r.   r/         g?F)r   preserve_indexT)check_metadatazmerged.parquet)r   )r   r   fieldint16float32r/   pd	DataFramenparangeuint8r   r   equalsr   ParquetWriterwrite_table)r$   r   df1df2table1table2writers          r)   :test_merging_parquet_tables_with_different_pandas_metadatarJ   K   sb    YY

#
"**,'
299;' F
 ,,yy"((+1BJJ/* C
 ,,1vt, C
 XX!!#fU!KFXX!!#fU!KF}}##FMM$#GGG==...g(88HF
v
vr+   c                    t        d      }t        j                  j                  t	        t        |j                  |j                  d d d               ddg      |_        | dz  }t        j                  j                  |      }|j                  j                  J t        ||       t        j                  |      }|j                         }t!        j"                  ||       y )N
   r   level_1level_2namesr   )r   r=   
MultiIndexfrom_tupleslistzipcolumnsr   r   r   r   pandas_metadatar	   r   read_pandas	to_pandastmassert_frame_equal)r$   r%   r&   r'   
table_readdf_reads         r)   %test_pandas_parquet_column_multiindexr^   h   s    	b	!B**SRZZ"-./)$ + BJ
 33H((&&r*K--999h')J""$G"g&r+   c                    t        d      }| dz  }t        j                  j                  |d      }|j                  j
                  }|d   rJ |d   sJ t        ||       t        j                  |      }|j                  j
                  }|d   rJ |j                  j                  }|j                  j                  |k(  sJ |j                         }t        j                  ||       y )Nr   r   r   Fr8   r   rV   )r   r   r   r   r   rW   r	   r   rX   r   rY   rZ   r[   )r$   r%   r&   r'   r(   r\   r    r]   s           r)   >test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_writtenra   {   s    	e	$B33H((&&r%&@K				+	+B/""" i==h')J				*	*B/"""%%..M&&-777""$G"g&r+   c                  D   t        d      } t        j                  j                  |       }t        j                         }t        ||d       |j                         }t        j                  |      }t        |      j                         }t        j                  | |       y )Nr   2.6versionr   r   r   r   BufferOutputStreamr	   getvalueBufferReaderr   rY   rZ   r[   r%   r'   imosbufreaderr]   s         r)   )test_pandas_parquet_native_file_roundtriprn      sv    		B((&&r*K  "DdE2
--/C__S!F&!++-G"g&r+   c                  j   t        d      } t        j                  j                  |       }t        j                         }t        ||d       |j                         }t        j                  |      }t        j                  |ddg      j                         }t        j                  | ddg   |       y )Nr   rc   rd   stringsrA   rV   )r   r   r   r   rg   r	   rh   ri   r   rX   rY   rZ   r[   rj   s         r)   test_read_pandas_column_subsetrr      s    		B((&&r*K  "DdE2
--/C__S!FnnG,ik  "i12G<r+   c                  D   t        d      } t        j                  j                  |       }t        j                         }t        ||d       |j                         }t        j                  |      }t        |      j                         }t        j                  | |       y )Nr   rc   rd   rf   rj   s         r)   #test_pandas_parquet_empty_roundtriprt      sv    		B((&&r*K  "DdE2
--/C__S!F&!++-G"g&r+   c                      ddiddiddigdd} t        j                  |       }t        j                  j	                  |      }t        j
                         }t        ||       y )	N	page_typer   record_typenon_consecutive_homer   1001)agg_col	uid_first)data)r=   r>   r   r   r   rg   r	   )r|   r%   r'   rk   s       r)   !test_pandas_can_write_nested_datar}      si     !A#Q'

 D 
4	 B((&&r*K  "Dd#r+   c                    | dz  }d}t        j                  t        j                  |t        j                        t        j                  |t        j
                        t        j                  |t        j                        t        j                  j                  |      dkD  g dd      }t        j                  j                  |      }|j                  d      5 }t        ||d	       d d d        t        j                  |j!                               }t#        |      }|j%                         }t'        j(                  ||       y # 1 sw Y   ^xY w)
Nzpandas_pyfile_roundtrip.parquetr7   r1   r   )foobarNbazqux)int64r<   float64boolrp   wbrc   rd   )r=   r>   r?   r@   r   r<   r   randomrandnr   r   r   openr	   ioBytesIO
read_bytesr   rY   rZ   r[   )	r$   r&   r   r%   r'   fr|   r\   r]   s	            r)   $test_pandas_parquet_pyfile_roundtripr      s    ::HD	4rxx099T499T4		%)5 
B ((&&r*K	t	[!U3 
 ::h))+,DT"J""$G"g& 
	s   E

Ec                    d}t         j                  j                  d       t        j                  t        j
                  |t         j                        t        j
                  |t         j                        t        j
                  |t         j                        t        j
                  |t         j                        t        j
                  |t         j                        t        j
                  |t         j                        t        j
                  |t         j                        t        j
                  |t         j                        t        j
                  |t         j                        t        j
                  |t         j                        t         j                  j                  |      dkD  d      }| dz  }t         j"                  j%                  |      }dD ]B  }t'        ||d|       t)        |      }|j+                         }t-        j.                  ||       D dD ]B  }t'        ||d|	       t)        |      }|j+                         }t-        j.                  ||       D d
D ]q  }	|	dk7  r*t         j0                  j2                  j5                  |	      s2t'        ||d|	       t)        |      }|j+                         }t-        j.                  ||       s y )Nr   r   r1   )rA   uint16uint32uint64int8r;   int32r   r<   r   r   r   )TFrc   )re   use_dictionary)re   write_statistics)NONESNAPPYGZIPLZ4ZSTDr   )re   compression)r?   r   seedr=   r>   r@   rA   r   r   r   r;   r   r   r<   r   r   r   r   r   r	   r   rY   rZ   r[   libCodecis_available)
r$   r   r%   r&   r'   r   r\   r]   r   r   s
             r)   )test_pandas_parquet_configuration_optionsr      s   DIINN1	4rxx0))D		2))D		2))D		2		$bhh/4rxx04rxx04rxx099T499T4		%) 
B 33H((&&r*K'[(E$2	4 *
&&(
b'* ( *[(E&6	8 *
&&(
b'* * A6!FFLL--k:[(E!,	. *
&&(
b'* Ar+   z)ignore:Parquet format '2.0':FutureWarningc                      t        d      } t        j                  ddt        |       z  d      | _        d| j                  _        t        | ddd      }t        j                  ||        y )	Nd   r   r   rL   r   z2.0spark)re   flavor)	r   r?   r@   lenindexr   r
   rZ   r[   )r%   results     r)   +test_spark_flavor_preserves_pandas_metadatar     s]     
c	"ByyBRL"-BHBHHM(8?.A BF&"%r+   c                 &   t        j                  d      dt        j                  d      dit        j                  d      t        j                  d      t        j                  d      t        j                  d      id}t        | dz        }t        j                  |d      j	                  dd	
      }t
        j                  j                  |      }t        ||       t        |      }|j                         }t        j                  ||       y )Nz2017-06-30 01:31:00g*_c@z2017-06-30 01:32:00)closetimedata.parquetzdatetime64[us]r1   r   Fdrop)r=   	Timestampstrr>   	set_indexr   r   r   r	   r   rY   rZ   r[   )r$   r|   pathdfxtdfxr'   	result_dfs          r)    test_index_column_name_duplicater     s     LL./1CLL./1C

 LL./%2 LL./%2	
D w'(D
 ,,t#3
4
>
>vE
>
RC88$Dtd#K%%'I)S)r+   c                    d}t        t        |            }t        j                  j	                  g d|gddg      }t        j
                  d|i|      }t        j                  j                  |      }| dz  }t        ||       t        |      }|j                  |      sJ |j                         }t        j                  ||       y )	Nr0   )r   r   r   foobarsome_numbersrP   numbers)r   zdup_multi_index_levels.parquet)rT   r   r=   rR   from_arraysr>   r   r   r   r	   r   rB   rY   rZ   r[   )	r$   num_rowsr   r   r%   tabler&   result_tabler   s	            r)    test_multiindex_duplicate_valuesr   :  s    H5?#GMM%%	(( & E
 
y'*%	8BHH  $E99H!x(L<<%%%&&(I)R(r+   c                     d}t        j                  t        j                  |      dd dd      }t	        | dz        }|j                         }t        j                  ||       y )N  carat        cut  color  clarity  depth  table  price     x     y     z
 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39\s{2,}r   pythonsep	index_colheaderenginezv0.7.1.parquet)r=   read_csvr   r   r   rY   rZ   r[   datadirexpected_stringexpectedr   r   s        r)   &test_backwards_compatible_index_namingr   P  s[    KO {{2::o6I%)!HFH"223E__F&(+r+   c                     d}t        j                  t        j                  |      dg ddd      j	                         }t        | dz        }|j                         }t        j                  ||       y )Nr   r   cutcolorclarityr   r   r   zv0.7.1.all-named-index.parquet)	r=   r   r   r   
sort_indexr   rY   rZ   r[   r   s        r)   1test_backwards_compatible_index_multi_level_namedr   e  sj    KO {{


?#- jl	  "BBCE__F&(+r+   c                 .   d}t        j                  t        j                  |      dg ddd      j	                         }|j
                  j                  g d      |_        t        | dz        }|j                         }t        j                  ||       y )	Nr   r   r   r   r   r   )r   Nr   zv0.7.1.some-named-index.parquet)r=   r   r   r   r   r   	set_namesr   rY   rZ   r[   r   s        r)   6test_backwards_compatible_index_multi_level_some_namedr   ~  s    KO {{


?#!< jl	 
 ^^--.FGHN"CCDE__F&(+r+   c           	      ^   t        d      t        t        j                        k  rt        j                  d       t        j
                  g dg dt        j                  ddd      d	      }t        j                  j                  g d	t        j                  ddd      gd
d g      |_	        | dz  }t        |      }|j                         }t        j                  ||       t        |dg      }|j                         }t        j                  ||dg   j                  d             y )Nz2.2.0zRegression in pandas 2.2.0r      r0   )g?g?g333333?z
2017-01-01r0   zEurope/Brussels)periodstzabcr   rP   z'v0.7.1.column-metadata-handling.parquetr   rq   Tr   )r   r=   __version__pytestskipr>   
date_rangerR   r   r   r   rY   rZ   r[   reset_index)r   r   r   r   r   s        r)   2test_backwards_compatible_column_metadata_handlingr     s   w72>>22 	01||lmmL!8IJ	LMH ]]..		|Q3D	E	Go / HN
 >>DE__F&(+seE__F&(C5/"="=4"="HIr+   c                  "   t        j                  ddgddggddg      } | d   j                  d      | d<   | j                  dg      } t        j
                  j                  |       }t	        j                         }t        j                  ||       t        j                  |j                               j                         }t        |j                  t         j                        sJ |j                  j!                  | j                        sJ y )	Nr   r   r   dc1c2rq   category)r=   r>   astyper   r   r   r   rg   r   rD   rX   rh   rY   
isinstancer   CategoricalIndexrB   )r%   r   bosref_dfs       r)   )test_categorical_index_survives_roundtripr     s     
SzC:.t	EB$xz*BtH	tf	BHH  $E



!CNN5#^^CLLN+557FfllB$7$7888<<rxx(((r+   c                     t        j                  dt        j                  g dg dd      i      } t        j                  j                  |       }t        j                         }t        j                  ||       |j                         }t        j                  |      j                         }t        j                  ||        y )Nr   )r   r   r   r   )r   r   r   T)
categoriesordered)r=   r>   Categoricalr   r   r   rg   r   rD   rh   rX   rY   rZ   r[   )r%   r   r   contentsr   s        r)   )test_categorical_order_survives_roundtripr     s     
sBNN$H I 
JB HH  $E



!CNN5#||~H^^H%//1F&"%r+   c                     t        j                  d gdz  dgdz  d      } | j                  ddd      }t        j                  j                  |       }t        j                  j                  |      }t        j                         }t        j                  ||dd       t        j                  |j                               }|d   j                  |d         sJ |d	   j                  |d	         sJ y )
Nr   g      ?)colr-   r   rc   rL   )re   
chunk_sizer   r   )r=   r>   r   r   r   r   rg   r   rD   
read_tablerh   rB   )r%   df_categoryr   	table_catrl   r   s         r)   *test_pandas_categorical_na_type_row_groupsr     s     
tfslC53;?	@B))JzBCKHH  $E$$[1I



!C NN9c5R@]]3<<>*F !9E!H%%%!9E!H%%%r+   c                  N   t        j                  g dd      } g d}t        j                  dt        j                  j                  | |      i      }t        j                         }t        j                  t        j                  |      |       t        j                  |j                               j                         }|j                  j                  dk(  sJ |j                  j                   j"                  |k(  j%                         sJ t'        j(                  ||       y )N)r   r   r   r   r   rM   r   r   r1   )r   r   r   x)r   r   )r?   arrayr=   r>   r   
from_codesr   rg   r   rD   r   r   rh   rY   r  r2   catr   allrZ   r[   )codesr   r%   rl   r   s        r)   !test_pandas_categorical_roundtripr    s    
 HH+7;E&J	sBNN55* 6 & ' 
(B 


!CNN288B<%]]3<<>*446F88>>Z'''HHLL##z166888&"%r+   c                    t        t        j                        t        d      k  rt        j                  d       t        j
                  dg did      }|j                  d      }t        j
                  dg di      }|j                  d      }t        j                  |d         j                         t        j                  |d         j                         k(  sJ t        j                  |d   j                  j                  j                        j                         t        j                  |d   j                  j                  j                        j                         k(  sJ t        | dz        }t        j                  t        j                   |      |       t        j"                  |      j%                         }t'        j(                  ||       y )	Nz1.3.0z:PyArrow backed string data type introduced in pandas 1.3.0r  )r   r   r   zstring[pyarrow]r1   r   zcat.parquet)r   r=   r   r   r   r>   r   r   r  	to_pylistr  r   valuesr   r   rD   r   r   rY   rZ   r[   )r$   rE   rF   r   r   s        r)   )test_categories_with_string_pyarrow_dtyper    sZ    r~~!11PQ
,,23;L
MC
**Z
 C
,,23
4C
**Z
 C 88CH'')RXXc#h-?-I-I-KKKK88CHLL++223==?288C&&D((1	4 4 4 w&'DNN288C=$']]4 **,F&#&r+   c                 0   t        j                  dg dd      }|d   j                  d      |d<   t        j                  |      }t        j                  |t        | dz        dg       t        j                  t        | dz              j                         }t        j                  |dg   |dg          t        j                  |t        | d	z               t        j                  t        | d	z              j                         }t        j                  |dg   |dg          t        j                  |t        | d
z               t        j                  t        | d
z              j                         }t        j                  |dg   |dg          y )Nr   r   partr   r   Int64case1r  partition_colscase2r   )r=   r>   r   r   r   r   write_to_datasetr   r   rY   rZ   r[   rD   )r$   r%   r   r   s       r)   5test_write_to_dataset_pandas_preserve_extensiondtypesr    sF   	s95	6B5	  )BuIHHRLEs7W$%vh ]]3w012<<>F&%/2ug;7s7W#456]]3w012<<>F&%/2ug;7NN5#g678]]3w789CCEF&%/2ug;7r+   c                 j   t        j                  g dg dd      }t        j                  g dd      |_        t	        j
                  |      }|ddg   j                         }|d   j                  d	      |d<   t        j                  |t        | d
z        dg       t        j                  t        | d
z              j                         }t        j                  ||       t        j                  |t        | dz               t        j                  t        | dz              j                         }t        j                  ||       t        j                  |t        | dz               t        j                  t        | dz              j                         }t        j                  ||       y )N)r   r   r   r   r  r   idxr   r   r  r   r  r  r  r   )r=   r>   Indexr   r   r   copyr   r   r  r   r   rY   rZ   r[   rD   )r$   r%   r   df_catr   s        r)   +test_write_to_dataset_pandas_preserve_indexr  "  sM    
yA	BBxxe4BHHHRLE %%'FF^**:6F6Ns7W$%vh ]]3w012<<>F&&)s7W#456]]3w012<<>F&"%NN5#g678]]3w789CCEF&"%r+   r8   )TFNmetadata_fname	_metadata_common_metadatac                    d}d}| t               z  }|j                          g }g }g }t        |      D ]  }	t        ||	      }
t	        j
                  t        j                  |	|z  |	dz   |z  d      d      |
_        |dj                  |	      z  }t        j                  j                  |
|	      }|j                  d       }|j                  j                  J t!        ||       |j#                  |       |j#                  |
       |j#                  |        t        j                  j                  
|	      }t%        j&                  |j                  ||z         t%        j(                  |      }d
dg}|j+                  |      j-                         }t	        j.                  |D cg c]  }||   	 c}      }|dur|
j                  j0                  nd |j                  _        t3        j4                  ||       y c c}w )Nr7   )r   r   r   r1   r   r  z
{}.parquetr`   rA   rp   rq   F)r   mkdirr   r   r=   r  r?   r@   r   formatr   r   r   replace_schema_metadatar   r   r	   appendr   write_metadataParquetDatasetrX   rY   concatr   rZ   r[   )r$   r8   r  nfilesr   dirpath	test_dataframespathsir%   r   r   table_for_metadatadatasetrV   r   r  r   s                      r)   (test_dataset_read_pandas_common_metadatar1  ;  s    FDGMMOIFE6]T*88IIa$hQ$g>W
 ,,Q//$$R$G --d3||$$,,,UD!bT# ( --
> .  (//>1IJ(G	"G   1;;=Fyyf5f!G*f56H'u4$ NN&(+ 6s   -Hc                    t        j                  dg di      }| dz  }t        ||       t        j                  dt        t        |       t                           }|j                  t        j                  |            sJ y )Nr   r   r   )
filesystem)r=   r>   r	   r   rX   r   r   r   rB   r   r   )r$   r%   r&   r   s       r)   %test_read_pandas_passthrough_keywordsr4  m  si     
sI&	'B'HX^^$S\?3DEF =="&&&r+   c                 z   t        j                  t        j                  ddgddgg      t        j                  ddg      d      }| dz  }t        j                  t        j
                         t        j
                               }t        j                  t        j                  d	|      t        j                  d
t        j
                               g      }t        j                  j                  ||      }t        ||       t        j                  |      j                         }t        j                  ||       y )N)id	something)value2else)r6  
something2)valueelse2r   r   )col1col2r   r=  r>  )r=   r>   Seriesr   map_r/   r   r:   r   r   r	   r   rX   rY   rZ   r[   )r$   r%   r&   udtr   r'   r   s          r)   test_read_pandas_map_fieldsrB  }  s     
		 "45!#56
  		5%.) 
B 'H
''"))+ryy{
+CYY-rxx		/LMNF((&&r62Kh'^^H%//1F&"%r+   )<r   r!   numpyr?   ImportErrorr   pyarrowr   
pyarrow.fsr   r   pyarrow.utilr   pyarrow.vendored.versionr   pyarrow.parquetparquetr   pyarrow.tests.parquet.commonr   r   r	   pandasr=   pandas.testingtestingrZ   r
   r   mark
pytestmarkr*   rJ   r^   ra   rn   rr   rt   r}   r   r   filterwarningsr   r   r   r   r   r   r   r   r   r   r  r  r  r  parametrizer1  r4  rB   r+   r)   <module>rT     s"  $ 
    9  , < <? [[  
 0 0&  8 ' '$ ' '0 ' ' 
= 
= ' ' $ $  ' '. )+ )+X GH& I & * *: ) )* , ,( , ,0 , ,2 J J0 ) ) & & & &" & &$ ' '. 8 8( & &0 )+>?)K9K+LM,, N @ ,,^ ' ' & &O  	B  	B  NBs3   N9 O O 9OOOO	O"!O"