Два вопроса по регулярным выражениям

Наги · Фев 6, 2021

Всем доброго времени суток!
На сей раз прихожу с двумя небольшими вопросами. Выложу код целиком - он довольно большой, но вопросы совсем маленькие.
Вот код:

Python:

from bs4 import BeautifulSoup
import requests
import re
from astropy.table import Table


def write_tab(lst_data):
    """Save the collected rows to ``Burst_info.txt`` as a fixed-width ASCII table.

    ``lst_data`` is a list of rows; every row must supply one value for each
    of the nine column names below, in the same order.  An existing
    ``Burst_info.txt`` is overwritten.
    """
    column_names = [
        "GCN", "Name", "Time", "TimeHHMMSS", "Duration",
        "Fluence", "PFscale", "Mode", "Z(Redshift)",
    ]

    Table(rows=lst_data, names=column_names).write(
        'Burst_info.txt',
        format='ascii.fixed_width',
        delimiter='',
        overwrite=True,
    )

def parce_line(string):
    """Extract burst parameters from the text of one circular.

    Newlines in ``string`` are replaced by spaces, then a series of regular
    expressions is applied.  Every extracted value is also printed.

    Returns ``None`` when the burst name contains ``SGR`` or when the text
    contains "in the waiting mode".  Otherwise returns the list
    ``[b_name, starttime, starttimeUT, duration, fluence, peak_fl, mode,
    redshift]`` of strings, where ``'--'`` marks a field that was not found.

    All patterns are raw strings: sequences such as ``\\s`` or ``\\d`` inside
    an ordinary string literal are invalid escape sequences and trigger a
    warning on current Python versions.
    """
    string = string.replace('\n', ' ')

    # --- burst name -------------------------------------------------------
    # NOTE(review): the last alternative ends in "\SGR" (one non-whitespace
    # character followed by "GR"); it is kept exactly as written.
    m = re.search(r'(GRB\s*\d{6}\w |GRB\s*\d{6} |SGR\s*\d+.\d+ |SGR\d+.\d+ |\d+\s*\SGR)', string)
    if m is not None:
        b_name = m.group(1).replace(' ', '')
    else:
        b_name = '--'
    print(b_name)

    if re.search(r'SGR', b_name):
        print("SGR burst - skipping")
        return None

    # The text is not modified below, so this check is needed only once.
    m = re.search(r'in\s+the\s+waiting\s+mode', string)
    if m:
        print("Waiting mode - skipping")
        return None

    # --- trigger time in seconds -------------------------------------------
    m = re.search(r'(T0\s*=\s*(\d+\.\d+) | starting at\s*(\d+)\s* | T0....\s*=\s*(\d+) | T0.....\s*=\s*(\d+)'
                  r'| T0\s*=\s*(\d+) | at\s*(\d+\.\d+)\s* | T0....\s*=\s*(\d+\.\d+)'
                  r'| T0\s*=\s*(\d+\.\d+) | \(T0....\s*=\s*(\d+\.\d+) | T0\s*=\s*(\d+\.\d+)\w)', string)

    if m is not None:
        # The whole match is used; the surrounding label text is stripped.
        starttime = m.group().replace('T0=', '').replace(' at ', '').replace(' ', '').replace('T0(KW)=', '').replace('(', '').replace('s', '')
    else:
        starttime = '--'
    print(starttime)

    # --- trigger time as HH:MM:SS ------------------------------------------
    m = re.search(r'(UT..(\d{2}:\d{2}:\d{2}.\d+)|T0.........(\d{2}:\d{2}:\d{2})|T0......(\d{2}:\d{2}:\d{2})'
                  r'|UT..(\d{2}:\d{2}:\d{2})|T0.......(\d{2}:\d{2}:\d{2}.\d+)'
                  r'|T0.......(\d{2}:\d{2}:\d{2})|(UT...(\d{2}:\d{2}:\d{2}.\d+))'
                  r'|(\d..(\d{2}:\d{2}:\d{2}.\d+))|at.(\d{2}:\d{2}:\d{2}.\d+))', string)

    if m is not None:
        starttimeUT = m.group().replace('T0=T0(BAT)=','').replace('UT (','').replace('T0(BAT)= ','').replace('T0(BAT)=','').replace('T0(MAXI)=','').replace('UT  (','').replace('5 (','').replace('at ','')
    else:
        starttimeUT = '--'
    print(starttimeUT)

    # --- duration ------------------------------------------------------------
    m = re.search(r'((duration\s+(?:of|is)\s*~\s*(\d+(?:\.\d+)?)\s*(s|ms))|(duration of the burst is\s*.(\d+\.\d+)\s*(s|ms))'
                  r'|(duration of the burst is\s*.(\d+)\s*(s|ms))|(duration of\s*(\d+\.\d+)\s*(s|ms))'
                  r'|(duration of\s*.(\d+)\s*(s|ms))|(burst\s......\s\w\w\s*.(\d+)\s*(s|ms))|(duration\s*.(\d+)\s*(s|ms))'
                  r'|(duration\s......\s\w{2}\s*.(\d+)\s*(s|ms))|(of\s\w{5}\s*.(\d+)\s*(s|ms))'
                  r'|(\d{4}\s\w{3}.\s\w\w\s*.(\d+\.\d+)\s*(s|ms))|(duration\s\w{2}\s\w{5}\s(\d+\.\d+)\s*(s|ms))'
                  r'|(duration\s\w{2}\s\w{5}\s.(\d+\.\d+)\s*(s|ms))|(duration\s\w{2}\s\w{3}\s\w{5}.\s\w{2}\s*.(\d+)\s*(s|ms))'
                  r'|(duration\s\w{2}\s\w{5}\s(\d+)\s*(s|ms))|(\d{4}\s\w{3}.\s\w\w\s*.(\d+)\s*(s|ms))'
                  r'|(duration\s\w{2}\s\w{3}\s\w{5}\s\w{2}\s\w{2}\s*.(\d+\.\d+)\s*(s|ms))|(about\s*.(\d+)\s*(s|ms))'
                  r'|(duration\s\w{2}\s\w{3}\s\w{5}\s\w{2}\s*.(\d+)\s*.(s|ms))|(duration\s\w{2}\s\w{3}\s\w{5}\s*.(\d+)\s*.(s|ms))'
                  r'|(\d{3}\s\w{3}.\s\w\w\s*.(\d+)\s*(s|ms))|(duration\s.(\d+\.\d+)\s*(s|ms)))', string)

    if m is not None:
        duration = m.group(1).replace('duration of ~', '').replace('duration of the burst is ~', '').replace('duration of~', '').replace('duration is ~', '').replace('duration  of ~', '').replace(' ', '').replace('burst(T100)is~', '').replace('duration(T100)of~', '').replace('durationof', '').replace('ofabout', '').replace('`', '').replace('1300keV)is~', '').replace('360keV)is~', '').replace('1400keV)of~', '').replace('theburstis~', '').replace('durationisabout', '').replace('duration~', '').replace('about', '').replace('theburstinis~', '').replace('~', '').replace('theburstof', '').replace('theburst', '')
    else:
        duration = '--'
    print(duration)

    # --- fluence -------------------------------------------------------------
    # re.search returns the leftmost match in the text, so whichever
    # alternative matches earliest in the circular wins.
    m = re.search(r'(fluence\s+of\s+(.+?)\s*erg|\d{4}\s\w{3}\s\w{4}\s\w{2}\s+(..................)'
                  r'|interval\s\w{2}\s+(...................)|fluence\s+(.+?)\s*erg)'
                  r'|fluence\s\w{2}\s+(.......)\s+erg/cm2', string)

    if m is not None:
        fluence = m.group(1).replace('1500 keV band is ','').replace('fluence of ','').replace(' erg','').replace('interval is ','').replace(' e','').replace('fluence ','').replace('this part is ','').replace('is ','').replace('the most intense part of the burst ','').replace(' ','').replace('~','').replace('theburstsapproximately','')
    else:
        fluence = '--'

    print(fluence)

    # --- peak-flux time scale --------------------------------------------------
    m = re.search((r'\d{1}.\d{1,4}-s|\d{1,4}-ms|measured\s\w\w\s\w\s\d{1}.\d{1,3}|cm2\s\w{3}\s\w\s\d{1}.\d{1,3}'
                   r'|followed\s\w{2}\s.\d{1,3}|and\s\w\s\d{1,3}-s|started\s\w{2}\s\d{1,3}|over\s\d{1}.\d{1,3}'), string)

    if m is not None:
        peak_fl = m.group(0)
        if peak_fl.find('-ms') != -1:
            # Millisecond scales are converted to seconds.
            peak_fl = float(peak_fl.replace('-ms', '')) / 1000
            peak_fl = str(peak_fl)
        peak_fl = peak_fl.replace('-ms', '').replace('-s', '').replace('measured on a ', '').replace('cm2 and a ', '').replace('followed in ~', '').replace('and a ', '').replace('started at ', '').replace('over ', '')
    else:
        peak_fl = '--'
    print(peak_fl)

    # --- detection mode --------------------------------------------------------
    m = re.search(r'triggered|waiting\s*mode', string)
    if m is not None:
        mode = m.group(0).replace(' ', '')
    else:
        mode = '--'
    print(mode)

    # --- redshift ----------------------------------------------------------------
    m = re.search(r'z\s*=\s*(\d+(?:\.\d+)?)', string)
    if m is None:
        redshift = '--'
    else:
        redshift = m.group(1)

    print(redshift)

    return [b_name, starttime, starttimeUT, duration, fluence, peak_fl, mode, redshift]

def main():
    """Download every circular listed in ``links.txt`` and tabulate the results.

    ``links.txt`` holds one URL per line.  For each URL the page is fetched,
    the first ``<p>`` element is converted to text and passed to
    ``parce_line``; rows for which ``parce_line`` returns ``None`` are
    dropped.  The collected rows are handed to ``write_tab``.
    """
    with open("links.txt") as f:
        links = [raw.strip() for raw in f]

    lst_data = []
    for line in links:
        if not line:
            # A trailing newline or an empty line in the file would otherwise
            # be passed to requests.get('') and abort the whole run.
            continue
        print(line)

        # Circular number = last path component without its extension.
        # A fixed-width slice (line[-10:-5]) kept a leading "/" whenever the
        # number had fewer than five digits.
        str_gcn = line.rsplit('/', 1)[-1].rsplit('.', 1)[0]

        response = requests.get(line)
        soup = BeautifulSoup(response.content, "lxml")
        ptag = soup.find('p')

        res = parce_line(str(ptag))
        if res is not None:
            lst_data.append([str_gcn] + res)

    write_tab(lst_data)

# Script entry point: main() is called unconditionally at module level.
main()

Вопрос 1:
В fluence неправильно отображается одна строчка. Вот из этого файла: https://gcn.gsfc.nasa.gov/gcn3/5570.gcn3 должно браться из строки:
"The burst fluence is ~4x10-6 erg/cm2" значение: "4x10-6". Но почему-то сейчас берется что-то вообще не то: "intervalT0-(T0+8s).Theb". Подскажите, пожалуйста, как настроить правильно отображение этого значения? Ну и чтобы другие значения из других файлов от этого не стали неправильно отображаться...
Вопрос 2:
В peak_fl не могу нормально настроить отображение значений из нескольких файлов:
https://gcn.gsfc.nasa.gov/gcn3/4564.gcn3

https://gcn.gsfc.nasa.gov/gcn3/4542.gcn3

https://gcn.gsfc.nasa.gov/gcn3/4439.gcn3

https://gcn.gsfc.nasa.gov/gcn3/4394.gcn3

Там все нужные мне значения объединены тем, что после них идет пробел и msec. Например отсюда:
"peak flux measured from T0+0.448 sec on 64 msec time scale" мне надо взять "64", за которым идет msec. И вот не пойму, как это сделать через регулярные выражения( А msec надо обязательно, чтоб осталось, тк потом там дальше это значение нужно будет делить на 1000 этой строкой: #peak_fl = float(peak_fl.replace('msec', ''))/ 1000.
Также прикладываю список ссылок, с которыми работает программа. Буду благодарна за любую помощь!

regnor · Фев 6, 2021

Наги сказал(а):
Вопрос 1:

Проблема в том, что re.search прекращает поиск после первого совпадения. В случае ссылки https://gcn.gsfc.nasa.gov/gcn3/5570.gcn3 фрагмент текста, подходящий под шаблон '(interval\s\w{2}\s+(...................)|fluence\s+(.+?)\s*erg)', находится раньше всех остальных, поэтому re.search его и выдаёт. Можно сделать так (проверил только на пяти ссылках из вашего топика, потому что сидеть и смотреть файл полностью — это с ума сойти)); посмотрите сами и подкорректируйте, если что-то не так. Код ниже:

Python:

...

    m = re.search('(fluence\s+of\s+(.+?)\s*erg|\d{4}\s\w{3}\s\w{4}\s\w{2}\s+(..................)'
                   '|fluence\s\w{2}\s+(.......)\s+erg/cm2)', string)
    m2 = re.search('(interval\s\w{2}\s+(...................)|fluence\s+(.+?)\s*erg)', string)

    if not m is None:
        fluence = m.group().replace('1500 keV band is ', '').replace('fluence of ', '').replace(' erg', '').replace(
            'interval is ', '').replace(' e', '').replace('fluence ', '').replace('this part is ', '').replace('is ',
                                                                                                               '').replace(
            'the most intense part of the burst ', '').replace(' ', '').replace('~', '').replace(
            'theburstsapproximately', '')
    elif not m2 is None:
        fluence = m2.group().replace('1500 keV band is ', '').replace('fluence of ', '').replace(' erg', '').replace(
            'interval is ', '').replace(' e', '').replace('fluence ', '').replace('this part is ', '').replace('is ',
                                                                                                               '').replace(
            'the most intense part of the burst ', '').replace(' ', '').replace('~', '').replace(
            'theburstsapproximately', '')
    else:
        fluence = '--'

    print(fluence)

...

Наги сказал(а):
Вопрос 2:

так же проверил только на пяти ссылках из топа, код ниже

Python:

...

    peak_fl = re.search(('\d{1}.\d{1,4}-s|\d{1,4}-ms|measured\s\w\w\s\w\s\d{1}.\d{1,3}|cm2\s\w{3}\s\w\s\d{1}.\d{1,3}'
                         '|followed\s\w{2}\s.\d{1,3}|and\s\w\s\d{1,3}-s|started\s\w{2}\s\d{1,3}|over\s\d{1}.\d{1,3}'
                         '|\d{1,3}\smsec'), string)

    if peak_fl != None:
        peak_fl = peak_fl.group()
        if peak_fl.find('-ms') != -1:
            peak_fl = float(peak_fl.replace('-ms', '')) / 1000
            peak_fl = str(peak_fl)
        if peak_fl.find('msec') != -1:
            peak_fl = float(peak_fl.replace('msec', '')) / 1000
            peak_fl = str(peak_fl)
        peak_fl = peak_fl.replace('-ms', '').replace('-s', '').replace('measured on a ', '').replace('cm2 and a ',
                                                                                                     '').replace(
            'followed in ~', '').replace('and a ', '').replace('started at ', '').replace('over ', '')
        print(peak_fl)
    else:
        peak_fl = '--'
        print(peak_fl)
       
...

Наги · Фев 6, 2021

regnor сказал(а):

проблема в том, что re.search прерывает поиск после первого совпадения, в случае ссылки https://gcn.gsfc.nasa.gov/gcn3/5570.gcn3 строка где есть шаблон '(interval\s\w{2}\s+(...................)|fluence\s+(.+?)\s*erg)' находиться раньше всех, поэтому он ее и выдает, можно сделать так, проверил только на пяти ссылках которые в вашем топике, потому что полностью файл сидеть смотреть это с ума сойти)) сами посмотрите подкорректируете если что не так, код ниже

Python:

...

    m = re.search('(fluence\s+of\s+(.+?)\s*erg|\d{4}\s\w{3}\s\w{4}\s\w{2}\s+(..................)'
                   '|fluence\s\w{2}\s+(.......)\s+erg/cm2)', string)
    m2 = re.search('(interval\s\w{2}\s+(...................)|fluence\s+(.+?)\s*erg)', string)

    if not m is None:
        fluence = m.group().replace('1500 keV band is ', '').replace('fluence of ', '').replace(' erg', '').replace(
            'interval is ', '').replace(' e', '').replace('fluence ', '').replace('this part is ', '').replace('is ',
                                                                                                               '').replace(
            'the most intense part of the burst ', '').replace(' ', '').replace('~', '').replace(
            'theburstsapproximately', '')
    elif not m2 is None:
        fluence = m2.group().replace('1500 keV band is ', '').replace('fluence of ', '').replace(' erg', '').replace(
            'interval is ', '').replace(' e', '').replace('fluence ', '').replace('this part is ', '').replace('is ',
                                                                                                               '').replace(
            'the most intense part of the burst ', '').replace(' ', '').replace('~', '').replace(
            'theburstsapproximately', '')
    else:
        fluence = '--'

    print(fluence)

...

так же проверил только на пяти ссылках из топа, код ниже

Python:

...

    peak_fl = re.search(('\d{1}.\d{1,4}-s|\d{1,4}-ms|measured\s\w\w\s\w\s\d{1}.\d{1,3}|cm2\s\w{3}\s\w\s\d{1}.\d{1,3}'
                         '|followed\s\w{2}\s.\d{1,3}|and\s\w\s\d{1,3}-s|started\s\w{2}\s\d{1,3}|over\s\d{1}.\d{1,3}'
                         '|\d{1,3}\smsec'), string)

    if peak_fl != None:
        peak_fl = peak_fl.group()
        if peak_fl.find('-ms') != -1:
            peak_fl = float(peak_fl.replace('-ms', '')) / 1000
            peak_fl = str(peak_fl)
        if peak_fl.find('msec') != -1:
            peak_fl = float(peak_fl.replace('msec', '')) / 1000
            peak_fl = str(peak_fl)
        peak_fl = peak_fl.replace('-ms', '').replace('-s', '').replace('measured on a ', '').replace('cm2 and a ',
                                                                                                     '').replace(
            'followed in ~', '').replace('and a ', '').replace('started at ', '').replace('over ', '')
        print(peak_fl)
    else:
        peak_fl = '--'
        print(peak_fl)
    
...

Спасибо огромное!! С peak_fl все идеально) А вот с fluence чуть-чуть проблемы возникли( 5570 теперь отлично работает, но немного съехало https://gcn.gsfc.nasa.gov/gcn3/21857.gcn3. Там получается теперь: of 1.60(-0.19,+0.28)x10^-5. И вот это "of" почему-то не убирается через .replace('of ', ''). И еще немного съехало теперь https://gcn.gsfc.nasa.gov/gcn3/3474.gcn3. Там получается: thetriggerburstwas8500countsinthesamenerg вместо (7.84 +/- 0.06)10-5. В остальном все отлично) Но вот эти 2 строчки...(
Код по fluence немного добавила:

Python:

    m = re.search('(fluence\s+of\s+(.+?)\s*erg|\d{4}\s\w{3}\s\w{4}\s\w{2}\s+(..................)'
                   '|fluence\s\w{2}\s+(.......)\s+erg/cm2)', string)
    m2 = re.search('(interval\s\w{2}\s+(...................)|fluence\s+(.+?)\s*erg)', string)

    if not m is None:
        fluence = m.group().replace('1500 keV band is ', '').replace('fluence of ', '').replace(' erg', '').replace(
            'interval is ', '').replace(' e', '').replace('fluence ', '').replace('this part is ', '').replace('is ',
                                                                                                               '').replace(
            'the most intense part of the burst ', '').replace(' ', '').replace('~', '').replace(
            'theburstsapproximately', '').replace('/cm2', '')
    elif not m2 is None:
        fluence = m2.group().replace('1500 keV band is ', '').replace('fluence of ', '').replace(' erg', '').replace(
            'interval is ', '').replace(' e', '').replace('fluence ', '').replace('this part is ', '').replace('is ',
                                                                                                               '').replace(
            'the most intense part of the burst ', '').replace(' ', '').replace('~', '').replace(
            'theburstsapproximately', '').replace('/cm2', '')
    else:
        fluence = '--'

    print(fluence)

regnor · Фев 7, 2021

попробуйте так

Python:

...

    m = re.search('(fluence\s+of\s+(.+?)\s*erg|\d{4}\s\w{3}\s\w{4}\s\w{2}\s+(..................)'
                  '|fluence\s\w{2}\s+(.......)\s+erg/cm2'
                  '|fluence\s(...................)\s+erg/cm2)', string)
    m2 = re.search('(interval\s\w{2}\s+(...................)|fluence\s+(.+?)\s*erg)', string)

    if not m is None:
        fluence = m.group().replace('1500 keV band is ', '').replace('fluence of ', '').replace(' erg', '').replace(
            'interval is ', '').replace(' e', '').replace('fluence ', '').replace('this part is ', '')\
            .replace('is ', '').replace('the most intense part of the burst ', '').replace(' ', '').replace('~', '')\
            .replace('theburstsapproximately', '').replace('/cm2', '').replace('of', '')
    elif not m2 is None:
        fluence = m2.group().replace('1500 keV band is ', '').replace('fluence of ', '').replace(' erg', '').replace(
            'interval is ', '').replace(' e', '').replace('fluence ', '').replace('this part is ', '')\
            .replace('is ', '').replace('the most intense part of the burst ', '').replace(' ', '').replace('~', '')\
            .replace('theburstsapproximately', '').replace('/cm2', '')
    else:
        fluence = '--'

    print(fluence.strip())
    
...

Наги · Фев 7, 2021

regnor сказал(а):

попробуйте так

Python:

...

    m = re.search('(fluence\s+of\s+(.+?)\s*erg|\d{4}\s\w{3}\s\w{4}\s\w{2}\s+(..................)'
                  '|fluence\s\w{2}\s+(.......)\s+erg/cm2'
                  '|fluence\s(...................)\s+erg/cm2)', string)
    m2 = re.search('(interval\s\w{2}\s+(...................)|fluence\s+(.+?)\s*erg)', string)

    if not m is None:
        fluence = m.group().replace('1500 keV band is ', '').replace('fluence of ', '').replace(' erg', '').replace(
            'interval is ', '').replace(' e', '').replace('fluence ', '').replace('this part is ', '')\
            .replace('is ', '').replace('the most intense part of the burst ', '').replace(' ', '').replace('~', '')\
            .replace('theburstsapproximately', '').replace('/cm2', '').replace('of', '')
    elif not m2 is None:
        fluence = m2.group().replace('1500 keV band is ', '').replace('fluence of ', '').replace(' erg', '').replace(
            'interval is ', '').replace(' e', '').replace('fluence ', '').replace('this part is ', '')\
            .replace('is ', '').replace('the most intense part of the burst ', '').replace(' ', '').replace('~', '')\
            .replace('theburstsapproximately', '').replace('/cm2', '')
    else:
        fluence = '--'

    print(fluence.strip())
   
...

Спасибо! Теперь все совсем идеально))))
Но я за сегодня себе ещё одну проблему сделала( Если не затруднит... я тогда тоже тут спрошу... Я продолжаю работу всё с тем же файлом, но с другим параметром. Там идёт выборка следующего вида: из строчек типа «measured from T0+8.096 s, of 1.86(-0.35,+0.36)x10^-5 erg/cm2/s» выделяется "1.86(-0.35,+0.36)x10^-5". Там не везде именно такое написание, но я более-менее продвинулась, сделав где-то 2/3 моих ссылок, а дальше всё пошло совсем плохо: каждая ссылка стала выдавать слишком много текста. Я сначала пыталась записать его в .replace, но его там ещё много, и... это как-то слишком. Может быть, можно мой код как-то немного оптимизировать, что ли?
Вот он:

Python:

from bs4 import BeautifulSoup
import requests
import re
from astropy.table import Table


def write_tab(lst_data):
    """Save the collected rows to ``Burst_info.txt`` as a fixed-width ASCII table.

    Every row of ``lst_data`` must hold two values, one per column
    (``PFti``, ``PF``).  An existing ``Burst_info.txt`` is overwritten.
    """
    # NOTE(review): the original comment described the columns as
    # "peak_fl, peak_fl_starttime" - confirm against what the caller passes.
    column_names = ["PFti", "PF"]

    Table(rows=lst_data, names=column_names).write(
        'Burst_info.txt',
        format='ascii.fixed_width',
        delimiter='',
        overwrite=True,
    )

def parce_line(string):
    """Extract the peak-flux value that precedes ``erg`` in a circular.

    Newlines in ``string`` are replaced by spaces and the alternatives of the
    pattern below are searched for; the leftmost match in the text wins.

    Each alternative contains exactly one capturing group that surrounds the
    value itself, so the value is read straight from that group.  This
    replaces a long chain of ``.replace(...)`` calls that tried to strip the
    surrounding text afterwards: the chain hard-coded text from individual
    circulars, removed every letter "s", and contained string literals
    broken across lines.

    Returns a one-element list: the extracted text, or ``'--'`` when nothing
    matched.  A found value is also printed.
    """
    string = string.replace('\n', ' ')

    m = re.search((r'\s*s,\s*of\s*(.+?)\s*erg|\s*,\s*of\s*(.+?)\s*erg'
                   r'|\d{2}\.\d{3}\s\w\s\w{2}\s(.+?)\s*erg/cm2/s,'
                   r'|\w{8}\s\w{4}\s...\d+\.\d+\s\w\s\w{2}\s(.+?)\s*erg/cm2'
                   r'|\d\.\d{3}\s\w\s\w{2}\s*(.+?)\s*erg'
                   r'|\d+\w\s*of\s*(.+?)\s*erg'), string)

    if m is None:
        peak_fl = '--'
    else:
        # Only the alternative that matched has a non-None group.
        peak_fl = next(g for g in m.groups() if g is not None).strip()
        print(peak_fl)

    return [peak_fl]

def main():
    """Download every circular listed in ``links.txt`` and tabulate the results.

    ``links.txt`` holds one URL per line.  For each URL the page is fetched,
    the first ``<p>`` element is converted to text and passed to
    ``parce_line``; its result is appended to the circular number to form one
    table row.  The collected rows are handed to ``write_tab``.
    """
    with open("links.txt") as f:
        links = [raw.strip() for raw in f]

    lst_data = []
    for line in links:
        if not line:
            # An empty line would otherwise be passed to requests.get('')
            # and abort the whole run.
            continue
        print(line)

        # Circular number = last path component without its extension.
        # A fixed-width slice (line[-10:-5]) kept a leading "/" whenever the
        # number had fewer than five digits.
        str_gcn = line.rsplit('/', 1)[-1].rsplit('.', 1)[0]

        response = requests.get(line)
        soup = BeautifulSoup(response.content, "lxml")
        ptag = soup.find('p')

        lst_data.append([str_gcn] + parce_line(str(ptag)))

    write_tab(lst_data)

# Script entry point: main() is called unconditionally at module level.
main()

Еще я пробовала иным методом, вот такой строчкой:
m = re.search(('(\d+(?:\.\d+)?)[-\s+](ms|s)\s+peak.+?flux.+?T0(.+?)\s*s,\s*of\s*(.+?)\s*erg'), string)
Потому что мне в общем-то потом и Т0 понадобится. Но там больших успехов не добилась и сосредоточилась на том варианте, что выше. Он с начала ссылок и до примерно 12 000 неплохо работает, а вот дальше все хуже и хуже...

regnor · Фев 7, 2021

дайте пример ссылки где совсем плохо

Наги · Фев 7, 2021

regnor сказал(а):
дайте пример ссылки где совсем плохо

В общем, вот эти строчки работают, судя по всему, как надо:
m = re.search(('\s*s,\s*of\s*(.+?)\s*erg|\s*,\s*of\s*(.+?)\s*erg'
'|\d{2}\.\d{3}\s\w\s\w{2}\s(.+?)\s*erg/cm2/s,'
'|\w{8}\s\w{4}\s...\d+\.\d+\s\w\s\w{2}\s(.+?)\s*erg/cm2'), string)
А вот в дальнейших ошибка. Возникла она начиная с этого номера: https://gcn.gsfc.nasa.gov/gcn3/21247.gcn3. По идее, если решить проблему там, может и дальше пойдет нормально. Пока же тот номер и примерно с этого: https://gcn.gsfc.nasa.gov/gcn3/10948.gcn3 - вот там все плохо.

Наги · Фев 7, 2021

regnor сказал(а):
дайте пример ссылки где совсем плохо

Я переписала свой код. Теперь он все корректно выдает с начала до https://gcn.gsfc.nasa.gov/gcn3/10948.gcn3. За исключением следующих номеров:
https://gcn.gsfc.nasa.gov/gcn3/21247.gcn3

https://gcn.gsfc.nasa.gov/gcn3/21630.gcn3

https://gcn.gsfc.nasa.gov/gcn3/11441.gcn3

https://gcn.gsfc.nasa.gov/gcn3/11439.gcn3

https://gcn.gsfc.nasa.gov/gcn3/11350.gcn3

https://gcn.gsfc.nasa.gov/gcn3/11127.gcn3

https://gcn.gsfc.nasa.gov/gcn3/11119.gcn3

Начиная с 10948 и до конца совсем плохо все(

Python:

    m = re.search(('\s*s,\s*of\s*(.+?)\s*erg|\s*,\s*of\s*(.+?)\s*erg'
                   '|\d{2}\.\d{3}\s\w\s\w{2}\s(.+?)\s*erg/cm2/s,'
                   '|\w{8}\s\w{4}\s...\d+\.\d+\s\w\s\w{2}\s(.+?)\s*erg/cm2'), string)

    if m is None:
        peak_fl = '--'
    else:
        peak_fl = m.group().replace(' s, of ','').replace(' erg','').replace(', of ','').replace('s','').replace('63.107  i','').replace('/cm2/,','').replace(' ','').replace('/cm2','').replace('meauredfromT0+18.752of','').replace('meauredfromT0+8.640of','').replace('meauredfromT0+2.048of','').replace('meauredfromT0+3.536of','').replace('meauredfromT0+4.304of','').replace('meauredfromT0+1.296of','').replace(' erg/cm2','').replace('meauredfromT0+18.128of','').replace('meauredfromT0-0.320of','').replace('meauredfromT0+211.6of','')

        print(peak_fl)

    return [peak_fl]

regnor · Фев 7, 2021

Ну, проблема та же самая: например, в ссылке https://gcn.gsfc.nasa.gov/gcn3/21247.gcn3 фрагмент, подходящий под шаблон '|\d{2}\.\d{3}\s\w\s\w{2}\s(.+?)\s*erg/cm2/s,', находится в тексте раньше всех, поэтому re.search его и показывает. Обратите внимание, что означает этот шаблон: 2 цифры, потом точка (она экранирована, то есть это именно символ «.», а не любой символ), потом 3 цифры и так далее...
Вам нужно либо разделить поиск на несколько вызовов re.search, либо использовать другие инструменты поиска (например, re.findall). Если вам не срочно, завтра посмотрю на работе в обед или вечером дома и тогда отпишусь... сегодня как-то лениво))

regnor · Фев 9, 2021

в общем проблема именно в этом, вот пример для ссылки https://gcn.gsfc.nasa.gov/gcn3/21247.gcn3

Python:

m = re.search(('\(\d{1}.\d{1}\s.\s\d{1}.\d{2}..\d{2}..\d{1}'), string)

занимает подбор этих шаблонов очень много времени...
вам наверное придется для каждой ссылки подбирать...

Поиск

Поиск

Два вопроса по регулярным выражениям

Наги

Пользователь

Вложения

regnor

Модератор

Наги

Пользователь

regnor

Модератор

Наги

Пользователь

regnor

Модератор

Наги

Пользователь

Наги

Пользователь

regnor

Модератор

regnor

Модератор