
    >"0g6                        d dl Z d dlmZ d dlZd dlmZ d dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlmZ  ej                  d      Z ej                  d      Zd Zd Zd Zd	 Zd
 Zd Zd%dZd Zd Zd Zd Zd Zd%dZd Zd Z d&dZ!	 	 d'dZ"d Z#d(dZ$d&dZ%d%dZ&d)dZ'd Z(d*dZ)d*dZ*d Z+d  Z,d! Z-d" Z.d# Z/d+d$Z0y),    N)datetime)copyfilez7It\s+was\s+last\s+changed\s+on\s+\d{4}.\d{2}.\d{2}\.\s*z\s+c                      t        j                         } t        j                         j	                  d      }dt        |       z   dz   |z   }|S )N%sz/tmp/_)osgetpidr   nowstrftimestr)pidtist	warc_paths      !/home/ernad/ernad/python/filer.pytemp_warc_pathr      sA    
))+C<<>""4(D#c("S(4/I    c                 L   t         j                  j                  t         j                  j                  |             s4	 t        j                  t         j                  j                  |              y y # t
        $ r(}|j                  t        j                  k7  r Y d }~y d }~ww xY wN)r   pathexistsdirnamemakedirsOSErrorerrnoEEXIST)filenameexcs     r   preparer      sl    77>>"''//(34	KK12 5  	yyELL( )	s   2A2 2	B#;BB#c                    t        j                  d      j                  }t        | t        j
                        st	        j                  |       }n| }|j                  |d       t        |       t        j                  j                  |      r.t        j                  ||d      rt        j                  |       y t        ||       t        j                  |       y )NF)deleteT)pretty_print)shallow)tempfileNamedTemporaryFilename
isinstanceetree_ElementTreeElementTreewriter   r   r   r   filecmpcmpremover   )ingestfufitemp_file_namedocs       r   install_xmlr2   '   s    00>CCNfe001'IIn4I0DM	ww~~d;;~tT:IIn%^T"IInr   c                    d}t        |       	 t        |       }t        j                  j                  |      s.t        |d      }|j                  |       |j                          yt        |d      5 }|j                         }t        |      }t        |      }d d d        k(  ryt        |d      }|j                  |       |j                          y# t        $ r | }Y w xY w# 1 sw Y   NxY w)N w   rr   )r   r   	Exceptionr   r   r   openr*   closereadnormalize_for_change)html
there_fufistring
there_file
old_string
new_strings         r   install_htmlrC   7   s    FJT 77>>*%*c*
 	j#	*__&
)*5
)&1
 
 Zj#&JV!   
	s   C 3'C'C$#C$'C0c                 j    t        | d      5 }|j                         }d d d        |S # 1 sw Y   S xY wNr7   r9   r;   )r/   filer?   s      r   slurprH   O   s-    	dCD 
M 
Ms   (2c                 ^    t         j                  d|       }t        j                  d|      }|S )Nr4   )
re_changedsubre_whitespace)r?   norm_strings     r   r<   r<   U   s*    ..V,K##B4Kr   c                     d}|rd}t         j                  j                  |       }t        j                  |      }|j                  |      }|S )Nz%Y-%m-%du   %Y‒%m‒%d)r   r   getmtimer   utcfromtimestampr   )r/   prettydate_formattimemdates        r   rT   rT   [   sF    K+77D!D$$T*DMM+&ELr   c                 Z   | dd  dk7  r,t        |       5 }	 t        j                  |      }d d d        |S t        j                  | d      5 }|j                         }|j                  d      }	 t        j                  |      }d d d        |S # t        j                  j                  $ r# t        d|  t        j                         i }Y w xY w# 1 sw Y   S xY w# t        j                  j                  $ r# t        d|  t        j                         i }Y w xY w# 1 sw Y   S xY w)N.gzzfiler can't decode rG   r7   utf-8)r9   jsonloaddecoderJSONDecodeErrorprintsysstderrgzipGzipFiler;   decodeloads)r/   the_filedata	json_file
json_bytesjson_strings         r   r[   r[   e   s   BCyE$Z8yy*  	tS	!Y^^%
 ''0	::k*D	 
" K <<// +D62D   ||++ 	'v.SZZ@D	 
" KsR   CB"D .C=C
CCCC=DD DD  D*c                    t        | t              r| }|}n|}| }t        |       |dd  dk7  r/t        |d      5 }t	        j
                  ||d       d d d        |S t	        j                  |dd      j                  d      d	z   }t        j                  |d      5 }|j                  |       ~d d d        |S # 1 sw Y   |S xY w# 1 sw Y   |S xY w)
NrV   rW   r5   r6   )indentF)ensure_asciirk   rY      
)r&   r   r   r9   rZ   dumpdumpsencodera   rb   r*   )a1a2r/   rf   re   dump_strings         r   rn   rn   y   s    "cDMBCyE$_IIdHQ/ **T$%''-vg?K	tS	!X{# 
" K  
" Ks   B9C9CCc                 Z    t         j                  j                  |       }t        |      }|S r   )r   r   rO   int)r/   mtimes     r   rv   rv      s$    GGT"EJELr   c                     t         j                  j                  |       }t        |      }t	        j
                  |      }|S r   )r   r   rO   ru   shotisermake)r/   rv   mshotis      r   shotir{      s2    GGT"EJE]]5!FMr   c                     t         j                  j                  |       }t        j                         j                  d      }t        |      |z
  }|S )Nr   )r   r   rO   r   r
   r   ru   )r/   rv   r
   ages       r   r}   r}      s?    GGT"E
,,.
!
!$
'C
c(U
CJr   c                     |rt        d|z   dz   | z          t        | d      5 }t        |d      5 }|j                  |j                                d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)Nz	I append z to abrb)r^   r9   r*   r;   )canonic_fufisupplem_fufi
do_verbosecanonic_filesupplem_files        r   concatr      sh    kL(61L@A	lD	!\,%|0023 & 
"	!%% 
"	!s"   A- A!A-!A*	&A--A6c                     t         j                  j                  |       st        | dz         t	        | d      }|j                         }|j                          |S )N is not there.br)r   r   isfiler8   r9   r;   r:   r/   re   r?   s      r   breadr      sI    77>>$//00D$H]]_FNNMr   c                    t         j                  j                  |       st        | dz         | dd  dk7  r't	        | d      5 }|j                         }d d d        |S t        j                  | d      5 }|j                         }|j                         }d d d        j                          S # 1 sw Y   S xY w# 1 sw Y   (xY w)Nr   rV   rW   r7   )
r   r   r   r8   r9   r;   ra   rb   rc   r:   r   s      r   sreadr      s    77>>$//00BCyE$_]]_F  M	 ]]4%]]_F]]_F & 	M  M	 &%s   B.3!B;.B8;Cc                     t         j                  j                  |       r!t        |       }||k(  r|rt	        d| z          yt        | d      }|j                  |       |j                          |rt	        d| z          y)NzNo change in FbwzI write T)r   r   r   r   r^   r9   r*   r:   )r/   r?   with_backupr   rA   re   s         r   briter      sk    	ww~~d4[
o,-D$HNN6NNj4 r   c                 
   |du r t         j                  j                  |       }n%t        |t              r|}nt        |t
              r|}|r@t         j                  j                  |       r!t        |       }||k(  r|rt        d| z          yt        | d      }| dd  dk7  r't        | d      5 }|j                  |       d d d        n>t        j                  | d      5 }|j                  |j                                d d d        |j                          |rt        d| z          t         j                  j                  |       st        |  d	      |t        j                  | |f       y# 1 sw Y   qxY w# 1 sw Y   }xY w)
NTzfiler: I keep Fr5   rV   rW   wbzfiler: I wrote z should have been written.)r   r   rO   r&   floatru   r   r   r^   r9   r*   ra   rp   r:   r8   utime)	r/   r?   	do_backupr   do_change_checkdo_preserve_timerS   rA   re   s	            r   sriter      s?   D ww%	$e	,	$c	*277>>$/4[
&-.D#HBCyE$_NN6" _ YYtT"hNN6==?+ #NN$&'77>>$4& :;<<#
d|$ _ #"s   ,E- E9-E69Fc                 `    t         j                  j                  |       }|dd }|d   dk7  ry |S )Nr   
      -)r   r   basename)r/   bana	issuedates      r   get_issuedater     s6    77D!DQr
I|sr   c                     t         j                  j                  |       s|ryt        d| z         t        j                  |       }|j
                  }||k  ryy)NTzfiler needs F)r   r   r   r8   statst_size)r/   allow_missingmin_sizeinfosizes        r   is_emptyr     sL    77>>$NT122774=D<<Dxr   c                 H   t         j                  j                  |       s|rt        d| z          yt        j                  |       }|j
                  }|dk(  r|s|rt        |  d       y|j                  }t        |t              st        d      |D ]}  }|t         j                  j                  |      st        d|z   t        j                         Dt        j                  |      }|j                  }	|	|kD  sk|rt        |  d|         y |rt        d	| z          y
)zdoes need renewalzfiler.donore does not see Tr   z	 is emptyz%filer.donere needs a list of in_fufisz donere does not see the in_fufi rX   z older than zfiler.donere skips F)r   r   r   r^   r   r   st_mtimer&   listr8   r_   r`   )
out_fufiin_fufisr   do_allow_emptyout_infoout_size	out_mtimein_fufiin_infoin_mtimes
             r   donerer   #  s
   77>>(#.9:wwx HH1}^XJi()!!Ih%?@@?ww~~g&4w>zz#'''"##i
,wi89  #h./r   c                 *   t         j                  j                  |       }t         j                  j                  |       }|d   dk7  r@t	        |dz   t
        j                         t	        d|d   z   t
        j                         y|dd }d	}|j                  d
      rd
}nt	        d|z   t
        j                         y|dz   |z   dz   |z   }t        j                  |      D ]-  }|| k(  r	|rt	        d|z          t        j                  |       / y )Nr   r   z, does not look like a filename to clear fromrX   zcheck digit is    Fr   r4   z.json.gzz)I can't handle the extension you have on /z_*z	I remove )
r   r   r   r   r^   r_   r`   endswithglobr-   )r/   r   r   fudir   restglob_string
found_fufis           r   clear_by_issuedater   E  s    77D!D77??4 DBx3dCC::	$r(*::	Qr
ID}}Z 9D@::	*y(4/$6Kii,
+
*+
		* -r   c                 :   t         j                  j                  |       }|rt        d| d|         |dz   |z   }|
| d| d| }t	        j                  |      D ]=  }|| k(  r|rt        d| z          |rt        d|z          t        j
                  |       ? y )Nzfiler clears z from z/*r   *zfiler.clear_by_extension keeps z!filer.clear_by_extension removes )r   r   r   r^   r   r-   )r/   extr   prefixr   r   r   s          r   clear_by_extensionr   _  s    77??4 DcU&/0+#Kaxq.ii,
7$>?5
BC
		* -r   c                    t         j                  j                  |       r| S | j                  d      r'| d d }t         j                  j                  |      r|S y | dz   }t         j                  j                  |      r|S y )NrW   rV   )r   r   r   r   )r/   fufi_without_gzfufi_with_gzs      r   is_therer   p  sm    	ww~~d}}Us)77>>/*""
  e|77>>,'r   c                 "   |t        j                  d      }	 t        j                  | |      }|S # t        $ r! t	        d|  t
        j                         Y y t         j                  $ r! t	        | dz   t
        j                         Y y w xY w)NT)remove_blank_textzfiling can not open rX   z is not well formed)r'   	XMLParserparser   r^   r_   r`   XMLSyntaxErrorr/   parserr1   s      r   	parse_laxr     s    ~48kk$' J  $TF+#**= d**<s   2 'B0BBc                 ^    |t        j                         }t        j                  | |      }|S r   )r'   r   r   r   s      r   parse_strictr     s,    ~" ++dF
#C Jr   c                 n    t        | d      5 }|j                  d      dk(  cd d d        S # 1 sw Y   y xY w)Nr      s   rF   )filepathtest_fs     r   is_gzr     s/    	h	{{1~,- -		s   +4c                 ^    t        | d      }|j                         }|j                          |S rE   )r9   	readlinesr:   )r/   re   liness      r   read_as_linesr     s*    D#H ENNLr   c                     t        | d      }|j                         }g }|D ]  }|j                  |d d         |j                          |S )Nr7   )r9   r   appendr:   )r/   re   r   outlines        r   
read_linesr     sK    D#H E
C

49 NNJr   c                    | j                  d      }|j                  d      }d}|t        t        |      t        |            k  r3||   ||   k7  rn'|dz  }|t        t        |      t        |            k  r3||d  }||d  }dt        |      dz
  z  dj                  |      z   }t        j
                  j                  |      rt	        j                  |       t	        j                  ||       y )Nr   r   r6   z../)	splitminlenjoinr   r   lexistsr-   symlink)	orig_fufi	dest_fufi	orig_dirs	dest_dirscommon_countr   s         r   	link_fufir     s    $I$IL
S^S^<
<\"i&== S^S^<
< ,-(I,-(IC	NQ&'#((9*==D	wwy!
		)JJtYr   c                 r    d}t        | d      5 }t        d |D              }d d d        |S # 1 sw Y   |S xY w)Nr   r   c              3       K   | ]  }d   yw)r6   N ).0r   s     r   	<genexpr>zcount_lines.<locals>.<genexpr>  s     .X!Xs   )r9   sum)r/   count_linesre   s      r   r   r     s:    K	dD	X.X.. 
 
s   ,6c                     t        |        t        j                  |       }d}|D ],  }|dz  }|rt        d|        t        j                  |       . |S )Nr   r6   zfiler removes )r^   r   r   r-   )r   r   fufiscount_filesr/   s        r   clear_by_globr     sV    	+IIk"EKqN4&)*
		$	 
 r   )F)FF)FFTN)Fr   )FNr   )T)1ra   r   r   
lxml.etreer'   r   r#   r+   rer   rZ   rx   r_   shutilr   compilerJ   rL   r   r   r2   rC   rH   r<   rT   r[   rn   rv   r{   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>r      s      	     	    
  RZZ @ A


6" 0((4Z 5:15@
D4" -
8  	r   