o
    c,                     @   sj  d dl Z d dlmZ d dlZd dlmZ d dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlmZ edZedZdd Zdd	 Zd
d Zdd Zdd Zdd Zd:ddZdd Zdd Zdd Zdd Zdd Zd:dd Zd!d" Zd#d$ Z d;d%d&Z!		d<d(d)Z"d*d+ Z#d=d,d-Z$d;d.d/Z%d:d0d1Z&d>d2d3Z'd4d5 Z(d?d6d7Z)d8d9 Z*dS )@    N)datetime)copyfilez7It\s+was\s+last\s+changed\s+on\s+\d{4}.\d{2}.\d{2}\.\s*z\s+c                  C   s.   t  } t d}dt|  d | }|S )N%sz/tmp/_)osgetpidr   nowstrftimestr)pidtist	warc_path r   !/home/ernad/ernad/python/filer.pytemp_warc_path   s   r   c              
   C   sf   t jt j| s1zt t j|  W d S  ty0 } z|jtjkr% W Y d }~d S d }~ww d S N)r   pathexistsdirnamemakedirsOSErrorerrnoEEXIST)filenameexcr   r   r   prepare   s   r   c                 C   sp   t jddj}t| }|j|dd t| tj	|r,t
j||ddr,t| d S t|| t| d S )NF)deleteT)pretty_print)shallow)tempfileNamedTemporaryFilenameetreeElementTreewriter   r   r   r   filecmpcmpremover   )elefufitemp_file_nameetr   r   r   install_xml'   s   


r,   c                 C   s   d}t | zt| }W n ty   | }Y nw tj|s.t|d}|| |  dS t|d}|	 }t
|}t
|}W d    n1 sJw   Y  ||krUdS t|d}|| |  dS )N w   rr   )r   r
   	Exceptionr   r   r   openr$   closereadnormalize_for_change)html
there_fufistring
there_file
old_string
new_stringr   r   r   install_html4   s.   




r<   c                 C   s8   t | d}| }W d    |S 1 sw   Y  |S )Nr0   r2   r4   )r)   filer8   r   r   r   slurpL   s   

r?   c                 C   s   t d| }td|}|S )Nr-   )
re_changedsubre_whitespace)r8   norm_stringr   r   r   r5   R   s   r5   Fc                 C   s0   d}|rd}t j| }t|}||}|S )Nz%Y-%m-%du   %Y‒%m‒%d)r   r   getmtimer   utcfromtimestampr	   )r)   prettydate_formattimemdater   r   r   rI   X   s   

rI   c              	   C   s  | dd  dkrFt | 0}zt|}W n tjjy+   td|  tjd i }Y n	w W d    |S W d    |S 1 s?w   Y  |S t	| d9}|
 }|d}zt|}W n tjjyt   td|  tjd i }Y n	w W d    |S W d    |S 1 sw   Y  |S )N.gzzfiler can't decode r>   r0   utf-8)r2   jsonloaddecoderJSONDecodeErrorprintsysstderrgzipGzipFiler4   decodeloads)r)   the_filedata	json_file
json_bytesjson_stringr   r   r   rO   b   sB   





rO   c                 C   s   t | tr
| }|}n|}| }t| |dd  dkr:t|d}tj||dd W d    |S 1 s3w   Y  |S tj|dddd}t	|d}|
| ~W d    |S 1 s]w   Y  |S )	NrJ   rK   r.   r/   )indentF)ensure_asciir^   rM   )
isinstancer
   r   r2   rN   dumpdumpsencoderU   rV   r$   )a1a2r)   rZ   rY   dump_stringr   r   r   ra   v   s*   



ra   c                 C   s   t j| }t|}|S r   )r   r   rD   int)r)   mtimer   r   r   rh      s   rh   c                 C   s"   t j| }t|}t|}|S r   )r   r   rD   rg   shotisermake)r)   rh   mshotir   r   r   shoti   s   
rl   c                 C   s*   t j| }t d}t|| }|S )Nr   )r   r   rD   r   r   r	   rg   )r)   rh   r   ager   r   r   rm      s   rm   c              	   C   s   |rt d| d |   t| d-}t|d}||  W d    n1 s)w   Y  W d    d S W d    d S 1 sAw   Y  d S )Nz	I append z to abrb)rR   r2   r$   r4   )canonic_fufisupplem_fufi
do_verbosecanonic_filesupplem_filer   r   r   concat   s   "ru   c                 C   s6   t j| st| d t| d}| }|  |S )N is not there.br)r   r   isfiler1   r2   r4   r3   r)   rY   r8   r   r   r   bread   s   
rz   c                 C   s   t j| st| d | dd  dkr0t| d}| }W d    |S 1 s)w   Y  |S t| d}| }| }W d    n1 sIw   Y  |	  |S )Nrv   rJ   rK   r0   )
r   r   rx   r1   r2   r4   rU   rV   rW   r3   ry   r   r   r   sread   s   


r{   c                 C   s`   t j| rt| }||kr|rtd|   dS t| d}|| |  |r.td|   dS )NzNo change in FbwzI write T)r   r   rx   rz   rR   r2   r$   r3   )r)   r8   with_backuprr   r:   rY   r   r   r   brite   s   

r~   Tc           	      C   s@  |du rt j| }nt|tr|}nt|tr|}|r4t j| r4t| }||kr4|r2td|   dS t	| d}| dd  dkr\t	| d}|
| W d    n1 sVw   Y  nt	| d}|
|  W d    n1 stw   Y  |  |rtd|   t j| st|  d	|d urt | ||f dS )
NTzfiler: I keep Fr.   rJ   rK   wbzfiler: I wrote z should have been written.)r   r   rD   r`   floatrg   rx   r{   rR   r2   r$   rU   rc   r3   r1   utime)	r)   r8   	do_backuprr   do_change_checkdo_preserve_timerH   r:   rY   r   r   r   srite   s:   


r   c                 C   s,   t j| }|dd }|d dkrd S |S )Nr   
      -)r   r   basename)r)   bana	issuedater   r   r   get_issuedate	  s
   r   c                 C   s@   t j| s|r
dS td|  t | }|j}||krdS dS )NTzfiler needs F)r   r   rx   r1   statst_size)r)   allow_missingmin_sizeinfosizer   r   r   is_empty  s   
r   c           
      C   s   t j| s|rtd|   dS t | }|j}|dkr)|s)|r't|  d dS |j}t|ts5t	d|D ],}t j|sItd| t
jd q7t |}|j}	|	|krc|r`t|  d|   dS q7|rltd	|   d
S )zdoes need renewalzfiler.donore does not see Tr   z	 is emptyz%filer.donere needs a list of in_fufisz donere does not see the in_fufi rL   z older than zfiler.donere skips F)r   r   rx   rR   r   r   st_mtimer`   listr1   rS   rT   )
out_fufiin_fufisrr   do_allow_emptyout_infoout_size	out_mtimein_fufiin_infoin_mtimer   r   r   donere  s:   


r   c                 C   s   t j| }t j| }|d dkr(t|d tjd td|d  tjd dS |dd }d	}|d
r8d
}ntd| tjd dS |d | d | }t|D ]}|| krYqR|ratd|  t 	| qRd S )Nr   r   z, does not look like a filename to clear fromrL   zcheck digit is    Fr   r-   z.json.gzz)I can't handle the extension you have on /z_*z	I remove )
r   r   r   r   rR   rS   rT   endswithglobr'   )r)   rr   r   fudir   restglob_string
found_fufir   r   r   clear_by_issuedate>  s4   
r   c                 C   s   t j| }|rtd| d|   |d | }|d ur&| d| d| }t|D ]}|| kr:|r9td|   q+|rBtd|  t | q+d S )Nzfiler clears z from z/*r   *zfiler.clear_by_extension keeps z!filer.clear_by_extension removes )r   r   r   rR   r   r'   )r)   extrr   prefixr   r   r   r   r   r   clear_by_extensionX  s   r   c                 C   sV   t j| r| S | dr| d d }t j|r|S d S | d }t j|r)|S d S )NrK   rJ   )r   r   rx   r   )r)   fufi_without_gzfufi_with_gzr   r   r   is_therei  s   
r   c                 C   sv   |d u r
t jdd}z	t | |}W |S  ty'   td|  tjd Y d S  t jy:   t| d tjd Y d S w )NT)remove_blank_textzfiling can not open rL   z is not well formed)r"   	XMLParserparser   rR   rS   rT   XMLSyntaxError)r)   parserdocr   r   r   	parse_laxw  s   r   c                 C   s<   t | d}|ddkW  d    S 1 sw   Y  d S )Nro      s   r=   )filepathtest_fr   r   r   is_gz  s   $r   )F)FF)FFTN)Fr   )FNr   )+rU   r   r   
lxml.etreer"   r   r   r%   rer   rN   ri   rS   shutilr   compiler@   rB   r   r   r,   r<   r?   r5   rI   rO   ra   rh   rl   rm   ru   rz   r{   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   sT    

	


	
-
 
	



