3
@6^                 @   s8   d dl Z d dlZd dlmZmZmZ G dd deZdS )    N)
SpiderFootSpiderFootPluginSpiderFootEventc               @   sd   e Zd ZdZddiZddiZdZe ZdZ	e
 fddZd	d
 Zdd Zdd Zdd Zdd ZdS )sfp_commoncrawlzeCommonCrawl:Footprint,Passive:Crawling and Scanning::Searches for URLs found through CommonCrawl.org.indexes   zYNumber of most recent indexes to attempt, because results tend to be occasionally patchy.NFc             C   sF   || _ | j | _t | _d| _x"t|j D ]}|| | j|< q,W d S )NF)sftempStorageresultslist	indexBase
errorStatekeysopts)selfsfcuserOptsopt r   :/var/www/spiderfoot.crq.systems/modules/sfp_commoncrawl.pysetup&   s    
zsfp_commoncrawl.setupc             C   s   t  }x| jD ]|}d| d | d }| jj|ddd}|d dkr\| jjdd d| _d S |d s|| jjdd d| _d S |j|d  qW |S )Nzhttps://index.commoncrawl.org/z-index?url=z/*&output=json<   r   )timeout	useragentcode400401402403404z0CommonCrawl search doesn't seem to be available.FTcontent)r   r   r   r   r   )r   r   r   fetchUrlerrorr   append)r   targetretindexurlresr   r   r   search/   s    
zsfp_commoncrawl.searchc             C   sL  d}| j j|ddd}|d dkr<| j jdd d| _t S |d s^| j jdd d| _t S tjd|d }d}t }x(|D ] }|jddjdd}d||< q~W t	t|j
 ddd| jd  }t|| jd k r| j jdd d| _t S t }	x<|D ]4}
|	jdt|
dd  d t|
dd    qW | j jdt|	  |	S )NzDhttps://commoncrawl.s3.amazonaws.com/cc-index/collections/index.htmlr   r   )r   r   r   r   r   r   r   r   z:CommonCrawl index collection doesn't seem to be available.FTr    z.*(CC-MAIN-\d+-\d+).*r   zCC-MAIN- -)reverser   z,Not able to find latest CommonCrawl indexes.   r   zCommonCrawl indexes: )r   r   r   r   r   )r   r!   r"   r   r   refindalldictreplacesortedr   r   lenr#   strdebug)r   r'   r(   r   ZhighestZ	indexlistmmsZ
topindexesZretindexir   r   r   getLatestIndexesE   s6    

"
4z sfp_commoncrawl.getLatestIndexesc             C   s   dgS )NINTERNET_NAMEr   )r   r   r   r   watchedEventsi   s    zsfp_commoncrawl.watchedEventsc             C   s   dgS )NLINKED_URL_INTERNALr   )r   r   r   r   producedEventso   s    zsfp_commoncrawl.producedEventsc             C   s  |j }|j}|j}| jjd| d |  | jr4d S || jkrBd S d| j|< t| jdkrd| j	 | _| js|| jj
dd d S t| jdkr| jj
dd d S | j|}|s| jj
dd d S t }x|D ]}yx~|jdD ]p}| j rd S t|d	k rqtj|}	d
|	krq|	d
 |krq|j|	d
  td|	d
 | j|}
| j|
 qW W q tk
r } z| jj
dt| d d S d }~X qX qW d S )NzReceived event, z, from Tr   z"Unable to fetch CommonCrawl index.Fz*Unable to obtain content from CommonCrawl.
   r'   r<   z%Malformed JSON from CommonCrawl.org: )	eventTypemoduledatar   r5   r   r
   r3   r   r9   r"   r)   r   splitcheckForStopjsonloadsr#   r   __name__notifyListenersBaseExceptionr4   )r   event	eventNamesrcModuleName	eventDatarB   sentr    linelinkevter   r   r   handleEvents   sR    








zsfp_commoncrawl.handleEvent)rG   
__module____qualname____doc__r   optdescsr
   r   r   r   r0   r   r)   r9   r;   r=   rS   r   r   r   r   r      s   	$r   )r.   rE   sflibr   r   r   r   r   r   r   r   <module>   s   