
    g                      ~    d dl Z d dlZd dlZ	 d dlZd dlmZ  G d de      Zd Ze	dk(  r e        yy# e$ r Y )w xY w)    N)VectorSpaceClustererc                   Z    e Zd ZdZ	 	 	 	 	 	 	 ddZddZddZd Zd Zd Z	d	 Z
d
 Zd Zy)KMeansClusterera  
    The K-means clusterer starts with k arbitrary chosen means then allocates
    each vector to the cluster with the closest mean. It then recalculates the
    means of each cluster as the centroid of the vectors in the cluster. This
    process repeats until the cluster memberships stabilise. This is a
    hill-climbing algorithm which may converge to a local maximum. Hence the
    clustering is often repeated with random initial means and the most
    commonly occurring output means are chosen.
    Nc
                    t        j                  | ||       || _        || _        || _        |rt        |      |k(  sJ || _        |dk\  sJ |r|dkD  rJ || _        |r|nt        j                         | _
        |	| _        y)a  
        :param  num_means:  the number of means to use (may use fewer)
        :type   num_means:  int
        :param  distance:   measure of distance between two vectors
        :type   distance:   function taking two vectors and returning a float
        :param  repeats:    number of randomised clustering trials to use
        :type   repeats:    int
        :param  conv_test:  maximum variation in mean differences before
                            deemed convergent
        :type   conv_test:  number
        :param  initial_means: set of k initial means
        :type   initial_means: sequence of vectors
        :param  normalise:  should vectors be normalised to length 1
        :type   normalise:  boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
                               dimensionsionality with SVD
        :type svd_dimensions: int
        :param  rng:        random number generator (or None)
        :type   rng:        Random
        :param avoid_empty_clusters: include current centroid in computation
                                     of next one; avoids undefined behavior
                                     when clusters become empty
        :type avoid_empty_clusters: boolean
           N)r   __init__
_num_means	_distance_max_differencelen_means_repeatsrandomRandom_rng_avoid_empty_clusters)
self	num_meansdistancerepeats	conv_testinitial_means	normalisesvd_dimensionsrngavoid_empty_clusterss
             H/var/www/openai/venv/lib/python3.12/site-packages/nltk/cluster/kmeans.pyr   zKMeansClusterer.__init__    s    H 	%%dI~F#!( C$6)$CCC#!||!gk22CFMMO	%9"    c                    | j                   r| j                  dkD  rt        d       g }t        | j                        D ]  }|rt        d|       | j                   r|dkD  r4| j                  j                  t        |      | j                        | _         | j                  ||       |j                  | j                           t        |      dkD  r|D ]  }|j                  t                d x}}t        t        |            D ]M  }d}	t        t        |            D ]#  }
||
k7  s	|	| j                  ||   ||
         z  }	% ||	|k  sG|	||   }}O || _         y y )Nr   z6Warning: means will be discarded for subsequent trialszk-means trial)keyr   )r   r   printranger   samplelistr	   _cluster_vectorspaceappendr   sortsum_sum_distances)r   vectorstracemeansstrialmeansmin_difference	min_meansidjs              r   cluster_vectorspacez#KMeansClusterer.cluster_vectorspaceP   s=   ;;4==1,JK4==)Eou-;;%!)"ii..tG}dooN%%gu5MM$++& * v;?  

s
#   *.-NY3v;'s6{+AAvT00F1IFF , ")Q-?016!9IN ( $DK# r   c                    | j                   t        |      k  rd}|st        | j                         D cg c]  }g  }}|D ]'  }| j                  |      }||   j	                  |       ) |rt        d       t        t        | j                  || j                              }| j                  | j                  |      }	|	| j                  k  rd}|| _	        |sy y y c c}w )NF	iterationT)r	   r   r"   classify_vectorspacer&   r!   r$   map	_centroidr   r)   r   )
r   r*   r+   	convergedmclustersvectorindex	new_means
differences
             r   r%   z$KMeansClusterer._cluster_vectorspacep   s    ??S\)I ).doo(>?(>1B(>?%F 55f=EUO**62 & +&
 !T^^Xt{{!KL	 "00iH
 4 44 $I (-   * @s   	Cc                     d x}}t        t        | j                              D ]/  }| j                  |   }| j                  ||      }|||k  s,||}}1 |S N)r"   r   r   r
   )r   r=   best_distance
best_indexr>   meandists          r   r7   z$KMeansClusterer.classify_vectorspace   sb     &*)
3t{{+,E;;u%D>>&$/D$}(<,14M
	 -
 r   c                 \    | j                   rt        | j                         S | j                  S rB   )r   r   r	   r   s    r   num_clusterszKMeansClusterer.num_clusters   s"    ;;t{{##??"r   c                     | j                   S )z0
        The means used for clustering.
        )r   rH   s    r   r.   zKMeansClusterer.means   s     {{r   c                 \    d}t        ||      D ]  \  }}|| j                  ||      z  } |S )Ng        )zipr
   )r   vectors1vectors2r@   uvs         r   r)   zKMeansClusterer._sum_distances   s7    
(+DAq$..A..J ,r   c                 ~   | j                   r2t        j                  |      }|D ]  }||z  }	 |dt        |      z   z  S t        |      s@t        j                  j                  d       t        j                  j                  d       J t        j                  |d         }|dd  D ]  }||z  }	 |t        |      z  S )Nr   z.Error: no centroid defined for empty cluster.
z4Try setting argument 'avoid_empty_clusters' to True
r   )r   copyr   sysstderrwrite)r   clusterrE   centroidr=   s        r   r9   zKMeansClusterer._centroid   s    %%yyH!F" "q3w</00w<

  !RS

  K uyy,H!!"+F" &c'l**r   c                 8    d| j                   | j                  fz  S )Nz%<KMeansClusterer means=%s repeats=%d>)r   r   rH   s    r   __repr__zKMeansClusterer.__repr__   s    6$++t}}9UUUr   )r   gư>NFNNF)F)__name__
__module____qualname____doc__r   r4   r%   r7   rI   r.   r)   r9   rY    r   r   r   r      sN     ".:`$@(8	#+$Vr   r   c                     ddl m} m} ddgddgddgddgfD cg c]  }t        j                  |       }}ddgd	d	gg} | d||
      }|j                  |dd      }t        d|       t        d|       t        d|j                                t                ddgddgddgddgddgddgfD cg c]  }t        j                  |       }} | d|d      }|j                  |d      }t        d|       t        d|       t        d|j                                t                t        j                  ddg      }t        d|z  d       t        |j                  |             t                y c c}w c c}w )Nr   )r   euclidean_distance   r                  )r   T)r+   z
Clustered:zAs:zMeans:
   )r   zclassify(%s): )end)	nltk.clusterr   r`   numpyarrayrV   r!   r.   classify)r   r`   fr*   r.   	clustererr<   r=   s           r   demorp      s{    A)*AAAA'GH'G!u{{1~'GGHVaVE#5UKI  $d ;H	, 	%	(IOO%&	G)*AAAAAQRTUPV'WX'W!u{{1~'WGX
  #5rBI  $/H	, 	%	(IOO%&	G [[!Q F	/F
",	)

V
$%	G7 I Ys   E30E8__main__)
rR   r   rS   rk   ImportErrornltk.cluster.utilr   r   rp   rZ   r^   r   r   <module>rt      s^      
	
 3gV* gVZ F zF o  		s   4 <<